seccomp系统调用及示例

seccomp 函数详解

1. 函数介绍

seccomp 是Linux系统调用过滤机制,用于限制进程可以执行的系统调用。它通过Berkeley Packet Filter (BPF) 程序来定义哪些系统调用是允许的,哪些是禁止的。seccomp 是构建沙箱环境、提高应用程序安全性的重要工具,可以有效防止恶意代码执行危险的系统调用。

2. 函数原型

#include <linux/seccomp.h>
#include <linux/filter.h>
#include <sys/prctl.h>
#include <unistd.h>

int prctl(int option, unsigned long arg2, unsigned long arg3, 
          unsigned long arg4, unsigned long arg5);

int seccomp(unsigned int operation, unsigned int flags, void *args);

3. 功能

seccomp 提供了系统调用级别的安全控制,可以:

  • 限制进程可执行的系统调用集合
  • 定义系统调用的执行策略(允许、错误、终止)
  • 使用BPF程序实现复杂的过滤逻辑
  • 构建安全的沙箱环境

4. 参数

prctl方式:

  • int option: 控制选项(如PR_SET_SECCOMP)
  • unsigned long arg2: seccomp模式(SECCOMP_MODE_STRICT/SECCOMP_MODE_FILTER)
  • 其他参数: 根据选项而定

seccomp系统调用(glibc未提供封装函数,需要通过 syscall(SYS_seccomp, operation, flags, args) 调用):

  • unsigned int operation: 操作类型(SECCOMP_SET_MODE_STRICT/SECCOMP_SET_MODE_FILTER)
  • unsigned int flags: 标志位(通常为0)
  • void *args: 操作参数(例如SECCOMP_SET_MODE_FILTER时为指向 struct sock_fprog 的指针)

5. 返回值

  • 成功: 返回0
  • 失败: 返回-1,并设置errno

6. 相似函数,或关联函数

  • prctl: 进程控制接口
  • personality: 设置进程执行特性
  • chroot: 改变根目录
  • capset: 设置线程的能力(capabilities)集合

7. 示例代码

示例1:基础seccomp使用

#define _GNU_SOURCE
#include <linux/seccomp.h>
#include <linux/filter.h>
#include <sys/prctl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/syscall.h>

/**
 * Demonstrates SECCOMP_MODE_STRICT.
 *
 * Prints the seccomp mode the process starts in, switches the process into
 * strict mode and then shows that write(2) keeps working.
 *
 * Strict mode only permits read(2), write(2), _exit(2) and sigreturn(2).
 * Every other system call gets the process killed with SIGKILL; that
 * includes prctl(2) itself and the exit_group(2) call the C library issues
 * when main() returns. Once strict mode is active this function therefore
 * never queries the mode again and never returns: it flushes stdout and
 * terminates through the raw exit system call.
 *
 * @return -1 if strict mode could not be enabled; does not return on success.
 */
int demo_seccomp_basic() {
    // Lengths are taken with sizeof so multi-byte UTF-8 text is never cut short.
    static const char plain_write_msg[] = "  普通write调用成功\n";
    static const char strict_write_msg[] = "  write调用仍然允许\n";

    printf("=== 基础seccomp使用示例 ===\n");

    // Report the starting mode; -1 means the query itself failed.
    int current_mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
    printf("当前seccomp模式: ");
    switch (current_mode) {
        case SECCOMP_MODE_DISABLED:
            printf("SECCOMP_MODE_DISABLED (禁用)\n");
            break;
        case SECCOMP_MODE_STRICT:
            printf("SECCOMP_MODE_STRICT (严格模式)\n");
            break;
        case SECCOMP_MODE_FILTER:
            printf("SECCOMP_MODE_FILTER (过滤模式)\n");
            break;
        case -1:
            printf("查询失败 (%s)\n", strerror(errno));
            break;
        default:
            printf("未知模式 (%d)\n", current_mode);
            break;
    }

    // An ordinary system call before the sandbox is active.
    printf("测试普通系统调用...\n");
    fflush(stdout);  // keep buffered stdio output ahead of the raw write
    write(STDOUT_FILENO, plain_write_msg, sizeof(plain_write_msg) - 1);

    printf("启用seccomp严格模式...\n");
    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0) == -1) {
        printf("启用seccomp失败: %s\n", strerror(errno));
        printf("注意:严格模式只允许read/write/_exit/sigreturn系统调用\n");
        return -1;
    }

    // From here on only read/write/_exit/sigreturn may be issued.
    printf("seccomp严格模式启用成功\n");
    // Calling prctl(PR_GET_SECCOMP) now would be fatal, so print the mode
    // that was just set instead of querying it.
    printf("当前seccomp模式: %d\n", SECCOMP_MODE_STRICT);

    printf("测试允许的系统调用...\n");
    fflush(stdout);
    write(STDOUT_FILENO, strict_write_msg, sizeof(strict_write_msg) - 1);

    // A forbidden call such as getpid() is only described, not executed:
    // executing it would end the demo with SIGKILL.
    printf("测试不允许的系统调用(程序将终止)...\n");
    printf("  尝试调用getpid()...\n");
    printf("  注意:getpid()等系统调用在严格模式下会被禁止\n");
    printf("  实际执行会导致程序被SIGKILL终止\n");

    // Leave via the raw exit syscall: exit()/_exit() from the C library use
    // exit_group, which strict mode does not allow.
    fflush(stdout);
    syscall(SYS_exit, 0);

    return 0;  // not reached
}

/* Entry point: the demo's result becomes the process exit status. */
int main() {
    int exit_status = demo_seccomp_basic();

    return exit_status;
}

示例2:自定义BPF过滤器

#define _GNU_SOURCE
#include <linux/seccomp.h>
#include <linux/filter.h>
#include <linux/audit.h>
#include <sys/prctl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/syscall.h>

#include <stddef.h>  /* offsetof(), used by the filter program below */

/**
 * Installs a seccomp-BPF whitelist and shows its effect.
 *
 * The filter lets read, write, exit and exit_group through and makes every
 * other system call fail with EPERM. After installing it the function
 * performs one allowed write and read, then tries getpid and open to show
 * that they are rejected.
 *
 * no_new_privs is set first because an unprivileged process may only install
 * a filter when that bit is set (otherwise CAP_SYS_ADMIN is required).
 *
 * NOTE: the filter does not inspect seccomp_data.arch; a production filter
 * should verify the architecture before trusting the syscall number.
 *
 * @return 0 on success, -1 if the filter could not be installed.
 */
int demo_custom_bpf_filter() {
    static const char write_ok_msg[] = "  write调用成功\n";

    printf("=== 自定义BPF过滤器示例 ===\n");

    // Whitelist program: each "JEQ nr, 0, 1" falls into the ALLOW return on a
    // match and skips over it otherwise.
    struct sock_filter filter[] = {
        // A = system call number
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),

        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_read, 0, 1),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),

        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_write, 0, 1),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),

        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_exit, 0, 1),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),

        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_exit_group, 0, 1),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),

        // Everything else fails with EPERM instead of killing the process.
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)),
    };

    struct sock_fprog prog = {
        .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])),
        .filter = filter,
    };

    printf("创建BPF过滤器,允许系统调用:\n");
    printf("  read(%d), write(%d), exit(%d), exit_group(%d)\n",
           (int)SYS_read, (int)SYS_write, (int)SYS_exit, (int)SYS_exit_group);
    printf("其他系统调用将返回EPERM错误\n");

    // Required for unprivileged callers before a filter can be installed.
    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) {
        printf("设置no_new_privs失败: %s\n", strerror(errno));
        return -1;
    }

    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0) == -1) {
        printf("应用BPF过滤器失败: %s\n", strerror(errno));
        printf("可能的原因:\n");
        printf("  1. 内核不支持seccomp BPF\n");
        printf("  2. 既没有CAP_SYS_ADMIN权限,也没有设置no_new_privs\n");
        printf("  3. 过滤器程序无效\n");
        return -1;
    }

    printf("BPF过滤器应用成功\n");

    // Allowed calls.
    printf("\n测试允许的系统调用:\n");
    fflush(stdout);  // keep buffered stdio output ahead of the raw write
    write(STDOUT_FILENO, write_ok_msg, sizeof(write_ok_msg) - 1);

    char buffer[10];
    ssize_t bytes_read = read(STDIN_FILENO, buffer, sizeof(buffer));
    if (bytes_read >= 0) {
        printf("  read调用成功\n");
    } else {
        printf("  read调用失败: %s\n", strerror(errno));
    }

    // Calls outside the whitelist.
    printf("\n测试不允许的系统调用:\n");
    long result = syscall(SYS_getpid);
    if (result == -1) {
        printf("  getpid调用被阻止: %s\n", strerror(errno));
    } else {
        printf("  getpid调用意外成功: %ld\n", result);
    }

    result = syscall(SYS_open, "/etc/passwd", 0);
    if (result == -1) {
        printf("  open调用被阻止: %s\n", strerror(errno));
    } else {
        printf("  open调用意外成功: %ld\n", result);
    }

    printf("\n安全的系统调用仍然可以正常工作\n");

    return 0;
}

/* Entry point: run the BPF whitelist demo and report its result. */
int main() {
    int exit_status = demo_custom_bpf_filter();

    return exit_status;
}

示例3:只读沙箱环境

#define _GNU_SOURCE
#include <linux/seccomp.h>
#include <linux/filter.h>
#include <linux/audit.h>
#include <sys/prctl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <sys/stat.h>

#include <stddef.h>      /* offsetof(), used by the filter program below */
#include <sys/socket.h>  /* AF_INET, SOCK_STREAM for the socket test */

/**
 * Builds and exercises a "read-only" seccomp sandbox.
 *
 * Filter policy:
 *   - read, close, exit and exit_group are always allowed;
 *   - write is allowed only when the descriptor is stdout or stderr;
 *   - open/openat are allowed only when the flags request neither write
 *     access nor file creation/truncation;
 *   - every other system call fails with EPERM.
 *
 * openat is handled as well as open because the C library's open() wrapper
 * may be implemented on top of openat.
 *
 * Each argument check loads a 32-bit word from seccomp_data.args[]; the
 * offsets used here address the low half of the 64-bit argument on
 * little-endian machines. NOTE: the filter does not inspect
 * seccomp_data.arch; a production filter should verify it first.
 *
 * @return 0 on success, -1 if the sandbox could not be created.
 */
int demo_readonly_sandbox() {
    // open(2) flags that can modify the file system. O_RDONLY is 0, so
    // "read-only" has to be detected as the absence of these bits.
    enum { OPEN_WRITE_FLAGS = O_WRONLY | O_RDWR | O_CREAT | O_TRUNC };

    static const char stdout_msg[] = "  写入stdout成功\n";
    static const char stderr_msg[] = "  写入stderr成功\n";

    printf("=== 只读沙箱环境示例 ===\n");

    // Jump offsets are relative to the next instruction. Every branch that
    // loads an argument ends in a return, so the accumulator still holds the
    // syscall number whenever the next comparison is reached.
    struct sock_filter filter[] = {
        // A = system call number
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),

        // Unconditionally allowed calls.
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_read, 0, 1),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_close, 0, 1),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_exit, 0, 1),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_exit_group, 0, 1),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),

        // write: only to stdout/stderr. Not write -> skip this 5-insn group.
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_write, 0, 5),
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])),
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STDOUT_FILENO, 1, 0),  // stdout -> allow
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STDERR_FILENO, 0, 1),  // stderr -> allow
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)),

#ifdef SYS_open
        // open(path, flags, ...): flags are args[1]. Not open -> skip 4 insns.
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_open, 0, 4),
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[1])),
        BPF_JUMP(BPF_JMP | BPF_JSET | BPF_K, OPEN_WRITE_FLAGS, 1, 0),  // write bit -> deny
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)),
#endif

        // openat(dirfd, path, flags, ...): flags are args[2].
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_openat, 0, 4),
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[2])),
        BPF_JUMP(BPF_JMP | BPF_JSET | BPF_K, OPEN_WRITE_FLAGS, 1, 0),  // write bit -> deny
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)),

        // Default: deny.
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA)),
    };

    struct sock_fprog prog = {
        .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])),
        .filter = filter,
    };

    printf("创建只读沙箱环境\n");
    printf("允许的操作:\n");
    printf("  - 读取文件(只读模式)\n");
    printf("  - 写入标准输出和标准错误\n");
    printf("  - 基本的进程控制\n");
    printf("禁止的操作:\n");
    printf("  - 写入文件\n");
    printf("  - 网络操作\n");
    printf("  - 进程创建\n");
    printf("  - 其他危险操作\n");

    // Required for unprivileged callers before a filter can be installed.
    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) {
        printf("创建沙箱失败: %s\n", strerror(errno));
        return -1;
    }

    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0) == -1) {
        printf("创建沙箱失败: %s\n", strerror(errno));
        return -1;
    }

    printf("只读沙箱创建成功\n");

    printf("\n=== 沙箱功能测试 ===\n");

    // 1. Read-only open + read must work.
    printf("1. 测试允许的读操作:\n");
    int fd = open("/etc/passwd", O_RDONLY);
    if (fd != -1) {
        char buffer[100];
        ssize_t bytes = read(fd, buffer, sizeof(buffer));
        if (bytes > 0) {
            printf("  读取/etc/passwd成功 (%zd 字节)\n", bytes);
        }
        close(fd);
    } else {
        printf("  打开/etc/passwd失败: %s\n", strerror(errno));
    }

    // 2. Writes to stdout/stderr must work.
    printf("\n2. 测试允许的写操作:\n");
    fflush(stdout);  // keep buffered stdio output ahead of the raw writes
    write(STDOUT_FILENO, stdout_msg, sizeof(stdout_msg) - 1);
    write(STDERR_FILENO, stderr_msg, sizeof(stderr_msg) - 1);

    // 3. Opening a file for writing must be rejected.
    printf("\n3. 测试禁止的写操作:\n");
    fd = open("/tmp/test_seccomp", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd == -1) {
        printf("  创建文件被阻止: %s\n", strerror(errno));
    } else {
        printf("  创建文件意外成功\n");
        close(fd);
        unlink("/tmp/test_seccomp");
    }

    // 4. Calls outside the whitelist must be rejected.
    printf("\n4. 测试禁止的系统调用:\n");
    long result = syscall(SYS_fork);
    if (result == -1) {
        printf("  fork被阻止: %s\n", strerror(errno));
    } else if (result == 0) {
        _exit(0);  // unexpected child: do not run the rest of the demo twice
    } else {
        printf("  fork意外成功: %ld\n", result);
    }

    result = syscall(SYS_socket, AF_INET, SOCK_STREAM, 0);
    if (result == -1) {
        printf("  socket被阻止: %s\n", strerror(errno));
    } else {
        printf("  socket意外成功: %ld\n", result);
    }

    printf("\n沙箱环境测试完成\n");

    return 0;
}

/* Entry point: run the read-only sandbox demo and report its result. */
int main() {
    int exit_status = demo_readonly_sandbox();

    return exit_status;
}

示例4:进程监控和日志

#define _GNU_SOURCE
#include <linux/seccomp.h>
#include <linux/filter.h>
#include <linux/audit.h>
#include <sys/prctl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/syscall.h>
#include <signal.h>
#include <sys/wait.h>

/**
 * Signal handler that reports the received signal on stdout.
 *
 * Prints the signal number and, for SIGSYS, an extra line saying that a
 * forbidden system call was detected. Only write(2) is used because printf()
 * is not async-signal-safe; errno is saved and restored so the interrupted
 * code does not observe a changed value.
 *
 * @param sig number of the signal being delivered.
 */
void signal_handler(int sig) {
    static const char prefix[] = "捕获信号 ";
    static const char sigsys_msg[] = "检测到被禁止的系统调用\n";
    const int saved_errno = errno;

    // Render sig as decimal text, filling the buffer from the back.
    char digits[16];
    size_t pos = sizeof(digits);
    unsigned int value = (sig < 0) ? 0u - (unsigned int)sig : (unsigned int)sig;

    digits[--pos] = '\n';
    do {
        digits[--pos] = (char)('0' + (value % 10u));
        value /= 10u;
    } while (value != 0u);
    if (sig < 0) {
        digits[--pos] = '-';
    }

    // Nothing useful can be done about a failed write inside a handler.
    ssize_t ignored = write(STDOUT_FILENO, prefix, sizeof(prefix) - 1);
    ignored = write(STDOUT_FILENO, &digits[pos], sizeof(digits) - pos);
    if (sig == SIGSYS) {
        ignored = write(STDOUT_FILENO, sigsys_msg, sizeof(sigsys_msg) - 1);
    }
    (void)ignored;

    errno = saved_errno;
}

#include <stddef.h>  /* offsetof(), used by the filter program below */

/**
 * Demonstrates a filter whose default action is SECCOMP_RET_TRACE.
 *
 * read, write, exit and exit_group are allowed; every other system call is
 * handed to a ptrace tracer. When no tracer is attached the kernel does not
 * execute such a call and it fails with ENOSYS, which is what the getpid and
 * getuid probes below report.
 *
 * A SIGSYS handler is registered as well, but SIGSYS is produced by
 * SECCOMP_RET_TRAP rather than SECCOMP_RET_TRACE, so this particular filter
 * does not trigger it.
 *
 * NOTE: the filter does not inspect seccomp_data.arch; a production filter
 * should verify it first.
 *
 * @return 0 on success, -1 if the handler or the filter could not be set up.
 */
int demo_seccomp_monitoring() {
    static const char allowed_msg[] = "允许的write调用\n";

    printf("=== seccomp监控和日志示例 ===\n");

    // Registered before the filter is active, while sigaction is still allowed.
    if (signal(SIGSYS, signal_handler) == SIG_ERR) {
        printf("注册SIGSYS处理程序失败: %s\n", strerror(errno));
        return -1;
    }

    struct sock_filter filter[] = {
        // A = system call number
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),

        // Basic I/O.
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_read, 0, 1),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),

        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_write, 0, 1),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),

        // Process termination.
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_exit, 0, 1),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),

        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_exit_group, 0, 1),
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),

        // Everything else: notify the tracer; the low 16 bits (1) are passed
        // along as event data.
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE | (1 & SECCOMP_RET_DATA)),
    };

    struct sock_fprog prog = {
        .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])),
        .filter = filter,
    };

    printf("创建带监控的日志过滤器\n");
    printf("SECCOMP_RET_TRACE可以用于:\n");
    printf("  - 系统调用追踪\n");
    printf("  - 安全审计\n");
    printf("  - 调试和分析\n");

    // Required for unprivileged callers before a filter can be installed.
    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) {
        printf("启用seccomp失败: %s\n", strerror(errno));
        return -1;
    }

    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0) == -1) {
        printf("启用seccomp失败: %s\n", strerror(errno));
        return -1;
    }

    printf("seccomp监控启用成功\n");

    printf("\n测试监控功能:\n");

    // Whitelisted call.
    fflush(stdout);  // keep buffered stdio output ahead of the raw write
    write(STDOUT_FILENO, allowed_msg, sizeof(allowed_msg) - 1);

    // Traced calls: without a tracer these fail instead of returning an id.
    printf("测试被监控的系统调用:\n");

    pid_t pid = getpid();
    if (pid == (pid_t)-1) {
        printf("getpid()未执行: %s\n", strerror(errno));
    } else {
        printf("getpid()返回: %d\n", (int)pid);
    }

    uid_t uid = getuid();
    if (uid == (uid_t)-1) {
        printf("getuid()未执行: %s\n", strerror(errno));
    } else {
        printf("getuid()返回: %d\n", (int)uid);
    }

    printf("注意:在实际应用中,SECCOMP_RET_TRACE会触发ptrace事件\n");
    printf("这需要额外的监控进程来处理追踪事件\n");
    printf("没有追踪进程时,被追踪的系统调用不会执行并返回ENOSYS\n");

    return 0;
}

/* Entry point: run the monitoring demo and report its result. */
int main() {
    int exit_status = demo_seccomp_monitoring();

    return exit_status;
}

示例5:安全沙箱应用

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <sys/mman.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>

#include <linux/audit.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

/**
 * Sandbox policy switches consumed by create_secure_sandbox().
 *
 * Each field is a flag: non-zero adds the corresponding group of system
 * calls to the whitelist, zero leaves them denied. The field order matters
 * because callers in this file initialise the struct positionally.
 */
typedef struct {
    int allow_network;           // socket, connect
    int allow_file_write;        // open, openat, close
    int allow_process_creation;  // fork, clone
    int allow_memory_mapping;    // mmap, munmap
} sandbox_config_t;

/* Capacity of the generated filter program, in BPF instructions. */
enum { SANDBOX_MAX_INSNS = 64 };

/**
 * Appends an "allow this syscall" pair (compare + return ALLOW) for every
 * number in calls[] to the program being built.
 *
 * @return 0 on success, -1 if the program would exceed SANDBOX_MAX_INSNS.
 */
static int sandbox_allow_syscalls(struct sock_filter *insns, size_t *len,
                                  const int *calls, size_t ncalls) {
    for (size_t i = 0; i < ncalls; i++) {
        if (*len + 2 > SANDBOX_MAX_INSNS) {
            return -1;
        }
        // BPF_JUMP/BPF_STMT expand to brace initialisers, so a compound
        // literal is needed to use them on the right-hand side of '='.
        insns[(*len)++] = (struct sock_filter)
            BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, calls[i], 0, 1);
        insns[(*len)++] = (struct sock_filter)
            BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW);
    }
    return 0;
}

/**
 * Builds a seccomp-BPF whitelist from the given configuration and installs
 * it on the calling process.
 *
 * read, write, exit and exit_group are always allowed; each enabled flag in
 * config adds its group of system calls. Everything else fails with EPERM.
 * no_new_privs is set first so that unprivileged callers may install the
 * filter. The installed policy cannot be relaxed afterwards.
 *
 * NOTE: the filter does not inspect seccomp_data.arch; a production filter
 * should verify it first.
 *
 * @param config sandbox switches; must not be NULL.
 * @return 0 on success, -1 on failure (errno is EINVAL for a NULL config).
 */
int create_secure_sandbox(const sandbox_config_t *config) {
    static const int essential_calls[] = {
        SYS_read, SYS_write, SYS_exit, SYS_exit_group,
    };
    static const int file_calls[] = {
#ifdef SYS_open
        SYS_open,
#endif
        SYS_openat, SYS_close,
    };
    static const int network_calls[] = { SYS_socket, SYS_connect };
    static const int process_calls[] = {
#ifdef SYS_fork
        SYS_fork,
#endif
        SYS_clone,
    };
    static const int memory_calls[] = { SYS_mmap, SYS_munmap };

    if (config == NULL) {
        errno = EINVAL;
        return -1;
    }

    printf("=== 创建安全沙箱 ===\n");

    // Optional groups, each guarded by its configuration flag.
    const struct {
        int enabled;
        const int *calls;
        size_t count;
    } groups[] = {
        { 1, essential_calls, sizeof(essential_calls) / sizeof(essential_calls[0]) },
        { config->allow_file_write, file_calls, sizeof(file_calls) / sizeof(file_calls[0]) },
        { config->allow_network, network_calls, sizeof(network_calls) / sizeof(network_calls[0]) },
        { config->allow_process_creation, process_calls, sizeof(process_calls) / sizeof(process_calls[0]) },
        { config->allow_memory_mapping, memory_calls, sizeof(memory_calls) / sizeof(memory_calls[0]) },
    };

    struct sock_filter filter[SANDBOX_MAX_INSNS];
    size_t count = 0;

    // A = system call number
    filter[count++] = (struct sock_filter)
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr));

    for (size_t g = 0; g < sizeof(groups) / sizeof(groups[0]); g++) {
        if (!groups[g].enabled) {
            continue;
        }
        if (sandbox_allow_syscalls(filter, &count, groups[g].calls, groups[g].count) != 0) {
            printf("创建沙箱失败: 过滤器程序过长\n");
            return -1;
        }
    }

    // Default action: deny with EPERM. One slot is needed for it.
    if (count + 1 > SANDBOX_MAX_INSNS) {
        printf("创建沙箱失败: 过滤器程序过长\n");
        return -1;
    }
    filter[count++] = (struct sock_filter)
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (EPERM & SECCOMP_RET_DATA));

    struct sock_fprog prog = {
        .len = (unsigned short)count,
        .filter = filter,
    };

    printf("沙箱配置:\n");
    printf("  网络访问: %s\n", config->allow_network ? "允许" : "禁止");
    printf("  文件写入: %s\n", config->allow_file_write ? "允许" : "禁止");
    printf("  进程创建: %s\n", config->allow_process_creation ? "允许" : "禁止");
    printf("  内存映射: %s\n", config->allow_memory_mapping ? "允许" : "禁止");

    // Required for unprivileged callers before a filter can be installed.
    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) {
        printf("创建沙箱失败: %s\n", strerror(errno));
        return -1;
    }

    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0) == -1) {
        printf("创建沙箱失败: %s\n", strerror(errno));
        return -1;
    }

    printf("安全沙箱创建成功\n");
    return 0;
}

/**
 * Demonstrates three sandbox configurations of decreasing strictness.
 *
 * An installed seccomp policy can never be relaxed, so each level is set up
 * in its own child process: the child installs the sandbox, runs a short
 * I/O check and exits, while the parent stays unrestricted and moves on to
 * the next level.
 *
 * @return 0 after all levels were attempted, -1 if a child process could not
 *         be created or waited for.
 */
int demo_security_levels() {
    static const char io_ok_msg[] = "基本I/O测试成功\n";

    const sandbox_config_t configs[] = {
        {0, 0, 0, 0},  // strictest: basic I/O only
        {0, 1, 0, 1},  // medium: file operations and memory mapping
        {1, 1, 1, 1},  // relaxed: network and process creation as well
    };
    const char *level_names[] = {"最高安全", "中等安全", "较低安全"};
    const size_t level_count = sizeof(configs) / sizeof(configs[0]);

    printf("=== 不同安全级别沙箱演示 ===\n");

    for (size_t level = 0; level < level_count; level++) {
        printf("\n--- %s级别沙箱 ---\n", level_names[level]);
        // Flush so the child does not inherit (and repeat) pending output.
        fflush(stdout);

        pid_t child = fork();
        if (child == -1) {
            printf("fork失败: %s\n", strerror(errno));
            return -1;
        }

        if (child == 0) {
            int child_status = 1;

            if (create_secure_sandbox(&configs[level]) == 0) {
                printf("沙箱 %s 创建成功\n", level_names[level]);

                fflush(stdout);
                write(STDOUT_FILENO, io_ok_msg, sizeof(io_ok_msg) - 1);

                if (configs[level].allow_network) {
                    printf("网络功能可用\n");
                }
                if (configs[level].allow_file_write) {
                    printf("文件写入功能可用\n");
                }
                child_status = 0;
            }

            fflush(stdout);
            _exit(child_status);
        }

        int status = 0;
        if (waitpid(child, &status, 0) == -1) {
            printf("waitpid失败: %s\n", strerror(errno));
            return -1;
        }
        if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
            printf("沙箱 %s 测试未正常完成\n", level_names[level]);
        }
    }

    return 0;
}

/**
 * Runs a small workload inside the strictest sandbox configuration.
 *
 * Installs a sandbox with every optional capability disabled, confirms that
 * plain output still works, then attempts a socket, a file creation and a
 * fork to show that each one is rejected. The calling process stays
 * sandboxed afterwards.
 *
 * @return 0 on success, -1 if the sandbox could not be created.
 */
int demo_practical_sandbox() {
    static const char write_ok_msg[] = "   write系统调用工作正常\n";

    printf("=== 实际沙箱应用演示 ===\n");

    // Strictest policy: only the essential I/O and exit calls remain.
    sandbox_config_t config = {0, 0, 0, 0};

    if (create_secure_sandbox(&config) != 0) {
        return -1;
    }

    printf("\n沙箱环境中运行测试程序:\n");

    // Output through stdio and through a raw write must both still work.
    printf("1. 基本输出测试:\n");
    printf("   标准输出工作正常\n");
    fflush(stdout);  // keep buffered stdio output ahead of the raw write
    write(STDOUT_FILENO, write_ok_msg, sizeof(write_ok_msg) - 1);

    printf("\n2. 被限制功能测试:\n");

    // Network access.
    long result = syscall(SYS_socket, AF_INET, SOCK_STREAM, 0);
    if (result == -1) {
        printf("   网络操作被成功阻止: %s\n", strerror(errno));
    } else {
        printf("   网络操作意外成功: %ld\n", result);
    }

    // File creation.
    result = syscall(SYS_open, "/tmp/test", O_WRONLY | O_CREAT, 0644);
    if (result == -1) {
        printf("   文件写入被成功阻止: %s\n", strerror(errno));
    } else {
        printf("   文件写入意外成功: %ld\n", result);
    }

    // Process creation.
    result = syscall(SYS_fork);
    if (result == -1) {
        printf("   进程创建被成功阻止: %s\n", strerror(errno));
    } else if (result == 0) {
        _exit(0);  // unexpected child: do not run the rest of the demo twice
    } else {
        printf("   进程创建意外成功: %ld\n", result);
    }

    printf("\n3. 沙箱优势:\n");
    printf("   ✓ 防止恶意代码执行危险操作\n");
    printf("   ✓ 限制程序的权限范围\n");
    printf("   ✓ 提供额外的安全层\n");
    printf("   ✓ 可以与其它安全机制配合使用\n");

    printf("\n4. 使用场景:\n");
    printf("   - 插件或扩展的安全执行\n");
    printf("   - 不可信代码的沙箱运行\n");
    printf("   - 容器和虚拟化环境\n");
    printf("   - 安全审计和监控\n");

    return 0;
}

/**
 * Entry point: runs the practical sandbox demo in a child process.
 *
 * A seccomp policy affects the whole process and cannot be removed, so the
 * demo is confined to a child while the parent only waits for it.
 *
 * @return the child's exit status, or 1 if the child could not be started,
 *         waited for, or did not exit normally.
 */
int main() {
    printf("seccomp - Linux系统调用过滤机制\n");
    printf("================================\n\n");

    // Flush first: otherwise the child inherits the pending banner and both
    // processes print it.
    fflush(stdout);

    pid_t child = fork();
    if (child == -1) {
        fprintf(stderr, "fork失败: %s\n", strerror(errno));
        return 1;
    }

    if (child == 0) {
        return demo_practical_sandbox();
    }

    int status = 0;
    if (waitpid(child, &status, 0) == -1) {
        fprintf(stderr, "waitpid失败: %s\n", strerror(errno));
        return 1;
    }

    if (WIFEXITED(status)) {
        return WEXITSTATUS(status);
    }

    return 1;
}

seccomp 使用注意事项

系统要求:

  1. 内核版本: 通过prctl启用严格模式需要Linux 2.6.23或更高版本;过滤模式(SECCOMP_MODE_FILTER)需要Linux 3.5或更高版本;seccomp()系统调用需要Linux 3.17或更高版本
  2. 架构支持: 支持多种CPU架构
  3. 编译选项: 需要内核编译时启用CONFIG_SECCOMP(过滤模式还需要启用CONFIG_SECCOMP_FILTER)

权限要求:

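1. 过滤模式: 调用者需要具有CAP_SYS_ADMIN能力,或者事先通过 prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) 设置no_new_privs位,否则安装过滤器会失败并返回EACCES
2. 无特权进程: 可以直接使用SECCOMP_MODE_STRICT;设置no_new_privs之后也可以使用SECCOMP_MODE_FILTER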
3. 容器环境: Docker等容器可能有限制

安全考虑:

1. 策略不可逆: 一旦应用,seccomp策略不能放松
2. 调试困难: 被阻止的系统调用可能难以调试
3. 兼容性: 可能影响程序的正常功能
4. 性能影响: BPF过滤会增加系统调用开销

最佳实践:

  1. 渐进式应用: 从宽松策略开始,逐步收紧
  2. 充分测试: 在生产环境前充分测试
  3. 错误处理: 妥善处理被阻止的系统调用
  4. 日志记录: 记录安全相关事件
  5. 备份方案: 提供策略失效时的处理方案

seccomp 模式详解

SECCOMP_MODE_STRICT (模式1):

  • 特点: 最简单的模式,只允许read/write/_exit/sigreturn(不包括exit_group);调用其他任何系统调用都会导致进程被SIGKILL终止
  • 优点: 简单、高效、安全
  • 缺点: 功能极其有限
  • 适用: 极度安全要求的简单程序

SECCOMP_MODE_FILTER (模式2):

  • 特点: 使用BPF程序定义复杂过滤规则
  • 优点: 灵活、功能强大
  • 缺点: 配置复杂
  • 适用: 大多数实际应用场景

常见系统调用编号

x86_64架构:

  • SYS_read = 0
  • SYS_write = 1
  • SYS_open = 2
  • SYS_close = 3
  • SYS_stat = 4
  • SYS_fstat = 5
  • SYS_lstat = 6
  • SYS_poll = 7
  • SYS_lseek = 8
  • SYS_mmap = 9
  • SYS_mprotect = 10
  • SYS_munmap = 11
  • SYS_brk = 12
  • SYS_rt_sigaction = 13
  • SYS_rt_sigprocmask = 14
  • SYS_rt_sigreturn = 15
  • SYS_ioctl = 16
  • SYS_pread64 = 17
  • SYS_pwrite64 = 18
  • SYS_readv = 19
  • SYS_writev = 20
  • SYS_access = 21
  • SYS_pipe = 22
  • SYS_select = 23
  • SYS_sched_yield = 24
  • SYS_mremap = 25
  • SYS_msync = 26
  • SYS_mincore = 27
  • SYS_madvise = 28
  • SYS_shmget = 29
  • SYS_shmat = 30
  • SYS_shmctl = 31
  • SYS_dup = 32
  • SYS_dup2 = 33
  • SYS_pause = 34
  • SYS_nanosleep = 35
  • SYS_getitimer = 36
  • SYS_alarm = 37
  • SYS_setitimer = 38
  • SYS_getpid = 39
  • SYS_sendfile = 40
  • SYS_socket = 41
  • SYS_connect = 42
  • SYS_accept = 43
  • SYS_sendto = 44
  • SYS_recvfrom = 45
  • SYS_sendmsg = 46
  • SYS_recvmsg = 47
  • SYS_shutdown = 48
  • SYS_bind = 49
  • SYS_listen = 50
  • SYS_getsockname = 51
  • SYS_getpeername = 52
  • SYS_socketpair = 53
  • SYS_setsockopt = 54
  • SYS_getsockopt = 55
  • SYS_clone = 56
  • SYS_fork = 57
  • SYS_vfork = 58
  • SYS_execve = 59
  • SYS_exit = 60
  • SYS_wait4 = 61
  • SYS_kill = 62
  • SYS_uname = 63
  • SYS_semget = 64
  • SYS_semop = 65
  • SYS_semctl = 66
  • SYS_shmdt = 67
  • SYS_msgget = 68
  • SYS_msgsnd = 69
  • SYS_msgrcv = 70
  • SYS_msgctl = 71
  • SYS_fcntl = 72
  • SYS_flock = 73
  • SYS_fsync = 74
  • SYS_fdatasync = 75
  • SYS_truncate = 76
  • SYS_ftruncate = 77
  • SYS_getdents = 78
  • SYS_getcwd = 79
  • SYS_chdir = 80
  • SYS_fchdir = 81
  • SYS_rename = 82
  • SYS_mkdir = 83
  • SYS_rmdir = 84
  • SYS_creat = 85
  • SYS_link = 86
  • SYS_unlink = 87
  • SYS_symlink = 88
  • SYS_readlink = 89
  • SYS_chmod = 90
  • SYS_fchmod = 91
  • SYS_chown = 92
  • SYS_fchown = 93
  • SYS_lchown = 94
  • SYS_umask = 95
  • SYS_gettimeofday = 96
  • SYS_getrlimit = 97
  • SYS_getrusage = 98
  • SYS_sysinfo = 99
  • SYS_times = 100
  • SYS_ptrace = 101
  • SYS_getuid = 102
  • SYS_syslog = 103
  • SYS_getgid = 104
  • SYS_setuid = 105
  • SYS_setgid = 106
  • SYS_geteuid = 107
  • SYS_getegid = 108
  • SYS_setpgid = 109
  • SYS_getppid = 110
  • SYS_getpgrp = 111
  • SYS_setsid = 112
  • SYS_setreuid = 113
  • SYS_setregid = 114
  • SYS_getgroups = 115
  • SYS_setgroups = 116
  • SYS_setresuid = 117
  • SYS_getresuid = 118
  • SYS_setresgid = 119
  • SYS_getresgid = 120
  • SYS_getpgid = 121
  • SYS_setfsuid = 122
  • SYS_setfsgid = 123
  • SYS_getsid = 124
  • SYS_capget = 125
  • SYS_capset = 126
  • SYS_rt_sigpending = 127
  • SYS_rt_sigtimedwait = 128
  • SYS_rt_sigqueueinfo = 129
  • SYS_rt_sigsuspend = 130
  • SYS_sigaltstack = 131
  • SYS_utime = 132
  • SYS_mknod = 133
  • SYS_uselib = 134
  • SYS_personality = 135
  • SYS_ustat = 136
  • SYS_statfs = 137
  • SYS_fstatfs = 138
  • SYS_sysfs = 139
  • SYS_getpriority = 140
  • SYS_setpriority = 141
  • SYS_sched_setparam = 142
  • SYS_sched_getparam = 143
  • SYS_sched_setscheduler = 144
  • SYS_sched_getscheduler = 145
  • SYS_sched_get_priority_max = 146
  • SYS_sched_get_priority_min = 147
  • SYS_sched_rr_get_interval = 148
  • SYS_mlock = 149
  • SYS_munlock = 150
  • SYS_mlockall = 151
  • SYS_munlockall = 152
  • SYS_vhangup = 153
  • SYS_modify_ldt = 154
  • SYS_pivot_root = 155
  • SYS__sysctl = 156
  • SYS_prctl = 157
  • SYS_arch_prctl = 158
  • SYS_adjtimex = 159
  • SYS_setrlimit = 160
  • SYS_chroot = 161
  • SYS_sync = 162
  • SYS_acct = 163
  • SYS_settimeofday = 164
  • SYS_mount = 165
  • SYS_umount2 = 166
  • SYS_swapon = 167
  • SYS_swapoff = 168
  • SYS_reboot = 169
  • SYS_sethostname = 170
  • SYS_setdomainname = 171
  • SYS_iopl = 172
  • SYS_ioperm = 173
  • SYS_create_module = 174
  • SYS_init_module = 175
  • SYS_delete_module = 176
  • SYS_get_kernel_syms = 177
  • SYS_query_module = 178
  • SYS_quotactl = 179
  • SYS_nfsservctl = 180
  • SYS_getpmsg = 181
  • SYS_putpmsg = 182
  • SYS_afs_syscall = 183
  • SYS_tuxcall = 184
  • SYS_security = 185
  • SYS_gettid = 186
  • SYS_readahead = 187
  • SYS_setxattr = 188
  • SYS_lsetxattr = 189
  • SYS_fsetxattr = 190
  • SYS_getxattr = 191
  • SYS_lgetxattr = 192
  • SYS_fgetxattr = 193
  • SYS_listxattr = 194
  • SYS_llistxattr = 195
  • SYS_flistxattr = 196
  • SYS_removexattr = 197
  • SYS_lremovexattr = 198
  • SYS_fremovexattr = 199
  • SYS_tkill = 200
  • SYS_time = 201
  • SYS_futex = 202
  • SYS_sched_setaffinity = 203
  • SYS_sched_getaffinity = 204
  • SYS_set_thread_area = 205
  • SYS_io_setup = 206
  • SYS_io_destroy = 207
  • SYS_io_getevents = 208
  • SYS_io_submit = 209
  • SYS_io_cancel = 210
  • SYS_get_thread_area = 211
  • SYS_lookup_dcookie = 212
  • SYS_epoll_create = 213
  • SYS_epoll_ctl_old = 214
  • SYS_epoll_wait_old = 215
  • SYS_remap_file_pages = 216
  • SYS_getdents64 = 217
  • SYS_set_tid_address = 218
  • SYS_restart_syscall = 219
  • SYS_semtimedop = 220
  • SYS_fadvise64 = 221
  • SYS_timer_create = 222
  • SYS_timer_settime = 223
  • SYS_timer_gettime = 224
  • SYS_timer_getoverrun = 225
  • SYS_timer_delete = 226
  • SYS_clock_settime = 227
  • SYS_clock_gettime = 228
  • SYS_clock_getres = 229
  • SYS_clock_nanosleep = 230
  • SYS_exit_group = 231
  • SYS_epoll_wait = 232
  • SYS_epoll_ctl = 233
  • SYS_tgkill = 234
  • SYS_utimes = 235
  • SYS_vserver = 236
  • SYS_mbind = 237
  • SYS_set_mempolicy = 238
  • SYS_get_mempolicy = 239
  • SYS_mq_open = 240
  • SYS_mq_unlink = 241
  • SYS_mq_timedsend = 242
  • SYS_mq_timedreceive = 243
  • SYS_mq_notify = 244
  • SYS_mq_getsetattr = 245
  • SYS_kexec_load = 246
  • SYS_waitid = 247
  • SYS_add_key = 248
  • SYS_request_key = 249
  • SYS_keyctl = 250
  • SYS_ioprio_set = 251
  • SYS_ioprio_get = 252
  • SYS_inotify_init = 253
  • SYS_inotify_add_watch = 254
  • SYS_inotify_rm_watch = 255
  • SYS_migrate_pages = 256
  • SYS_openat = 257
  • SYS_mkdirat = 258
  • SYS_mknodat = 259
  • SYS_fchownat = 260
  • SYS_futimesat = 261
  • SYS_newfstatat = 262
  • SYS_unlinkat = 263
  • SYS_renameat = 264
  • SYS_linkat = 265
  • SYS_symlinkat = 266
  • SYS_readlinkat = 267
  • SYS_fchmodat = 268
  • SYS_faccessat = 269
  • SYS_pselect6 = 270
  • SYS_ppoll = 271
  • SYS_unshare = 272
  • SYS_set_robust_list = 273
  • SYS_get_robust_list = 274
  • SYS_splice = 275
  • SYS_tee = 276
  • SYS_sync_file_range = 277
  • SYS_vmsplice = 278
  • SYS_move_pages = 279
  • SYS_utimensat = 280
  • SYS_epoll_pwait = 281
  • SYS_signalfd = 282
  • SYS_timerfd_create = 283
  • SYS_eventfd = 284
  • SYS_fallocate = 285
  • SYS_timerfd_settime = 286
  • SYS_timerfd_gettime = 287
  • SYS_accept4 = 288
  • SYS_signalfd4 = 289
  • SYS_eventfd2 = 290
  • SYS_epoll_create1 = 291
  • SYS_dup3 = 292
  • SYS_pipe2 = 293
  • SYS_inotify_init1 = 294
  • SYS_preadv = 295
  • SYS_pwritev = 296
  • SYS_rt_tgsigqueueinfo = 297
  • SYS_perf_event_open = 298
  • SYS_recvmmsg = 299
  • SYS_fanotify_init = 300
  • SYS_fanotify_mark = 301
  • SYS_prlimit64 = 302
  • SYS_name_to_handle_at = 303
  • SYS_open_by_handle_at = 304
  • SYS_clock_adjtime = 305
  • SYS_syncfs = 306
  • SYS_sendmmsg = 307
  • SYS_setns = 308
  • SYS_getcpu = 309
  • SYS_process_vm_readv = 310
  • SYS_process_vm_writev = 311
  • SYS_kcmp = 312
  • SYS_finit_module = 313
  • SYS_sched_setattr = 314
  • SYS_sched_getattr = 315
  • SYS_renameat2 = 316
  • SYS_seccomp = 317
  • SYS_getrandom = 318
  • SYS_memfd_create = 319
  • SYS_kexec_file_load = 320
  • SYS_bpf = 321
  • SYS_execveat = 322
  • SYS_userfaultfd = 323
  • SYS_membarrier = 324
  • SYS_mlock2 = 325
  • SYS_copy_file_range = 326
  • SYS_preadv2 = 327
  • SYS_pwritev2 = 328
  • SYS_pkey_mprotect = 329
  • SYS_pkey_alloc = 330
  • SYS_pkey_free = 331
  • SYS_statx = 332
  • SYS_io_pgetevents = 333
  • SYS_rseq = 334

总结

seccomp 是Linux系统中强大的安全机制,提供了:

1. 系统调用级别的访问控制: 精确控制进程可以执行的操作
2. 灵活的策略定义: 通过BPF程序实现复杂过滤逻辑
3. 高效的执行: 内核级别的过滤,性能开销小
4. 广泛的应用场景: 适用于沙箱、容器、安全审计等

通过合理使用seccomp,可以显著提高应用程序的安全性,构建更加安全可靠的计算环境。在实际应用中,需要仔细设计过滤策略,充分测试,并考虑错误处理和调试需求。

此条目发表在linux文章分类目录。将固定链接加入收藏夹。

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注