AWK 高级应用手册

AWK 高级应用手册

一、网络编程与套接字(/inet 特殊文件和 |& 双向管道是 GNU awk 的扩展,以下示例需要 gawk)

1. TCP 客户端通信

# Minimal TCP client (requires gawk for the /inet special files)
awk 'BEGIN {
    # Special file layout: /inet/tcp/<local-port>/<remote-host>/<remote-port>.
    # A local port of 0 lets the system choose one.
    endpoint = "/inet/tcp/0/127.0.0.1/8080"

    # Send a single line over the two-way connection.
    print "Hello Server" |& endpoint

    # Read one reply line; getline yields a positive value only on success.
    got = (endpoint |& getline reply)
    if (got > 0)
        print "Server response:", reply

    close(endpoint)
}'

2. TCP 服务器

# Minimal TCP server listening on port 8080 (requires gawk)
awk 'BEGIN {
    # Local port 8080; remote host and port are both 0, meaning any client.
    listener = "/inet/tcp/8080/0/0"
    print "Server listening on port 8080..."

    for (;;) {
        # Each successful read places one line from the client in $0.
        if ((listener |& getline) <= 0)
            break

        print "Client connected"
        print "Received:", $0

        # Echo the line back, then close the connection.
        print "Echo: " $0 |& listener
        close(listener)
    }
}'

3. HTTP 客户端示例

# Simple HTTP GET request (requires gawk for the /inet special files)
awk 'BEGIN {
    host = "www.example.com"
    port = 80
    socket = "/inet/tcp/0/" host "/" port

    # Build the request. The final CRLF CRLF is the empty line that ends
    # the header block.
    request = "GET / HTTP/1.1\r\n"
    request = request "Host: " host "\r\n"
    request = request "Connection: close\r\n\r\n"

    # Send the request bytes exactly as built. print would append ORS and
    # push a stray newline onto the wire after the header terminator.
    printf "%s", request |& socket

    # Read the response line by line until the server closes the connection.
    while ((socket |& getline) > 0) {
        print $0
    }

    close(socket)
}'

二、进程间通信

1. 管道通信

# Two-way conversation with an external command (gawk coprocess)
awk 'BEGIN {
    sorter = "sort -n"

    # Feed the unsorted values to sort, one per line.
    total = split("30 10 20", values, " ")
    for (k = 1; k <= total; k++)
        print values[k] |& sorter

    # Close only the write side so that sort sees end-of-input
    # and starts producing output.
    close(sorter, "to")

    # Drain the sorted lines.
    while ((sorter |& getline) > 0)
        print "Sorted:", $0

    close(sorter)
}'

2. 协同进程

# Start a coprocess (requires gawk for the |& operator)
awk 'BEGIN {
    coproc = "tr a-z A-Z"  # coprocess that upper-cases its input

    # Send the data.
    print "hello world" |& coproc

    # Close the write end first. tr buffers its output when writing to a
    # pipe and emits nothing until it sees end-of-input, so reading before
    # this point would block forever.
    close(coproc, "to")

    # Read the converted line.
    if ((coproc |& getline result) > 0) {
        print "Result:", result
    }

    close(coproc)
}'

三、动态数据结构

1. 动态数组管理

# Named dynamic arrays
#
# All named arrays share two global tables:
#   ARR_LEN[name]       number of elements stored under that name
#   ARR_DATA[name, i]   element i (0-based) of that array
# Everything is plain awk; no shell round trip is needed to emulate
# variables that are addressed by name.
awk '
BEGIN {
    # Create the array.
    init_array("myarray")

    # Append elements.
    array_push("myarray", "first")
    array_push("myarray", "second")
    array_push("myarray", "third")

    # Fetch one element.
    print "Element at index 1:", array_get("myarray", 1)

    # Length of the array.
    print "Array length:", array_length("myarray")

    # Walk the array in index order.
    for (i = 0; i < array_length("myarray"); i++) {
        print "Index", i, ":", array_get("myarray", i)
    }
}

# init_array(name): create the array called name, or empty it if it exists.
function init_array(name,    k) {
    for (k = 0; k < ARR_LEN[name]; k++)
        delete ARR_DATA[name, k]
    ARR_LEN[name] = 0
}

# array_push(name, value): append value; returns the new length.
function array_push(name, value) {
    ARR_DATA[name, ARR_LEN[name] + 0] = value
    return ++ARR_LEN[name]
}

# array_get(name, idx): element at 0-based position idx, or "" when idx is
# out of range. The parameter is not called index because that is the name
# of a built-in function.
function array_get(name, idx) {
    if (idx < 0 || idx >= ARR_LEN[name] + 0)
        return ""
    return ARR_DATA[name, idx]
}

# array_length(name): number of stored elements (0 for an unknown name).
function array_length(name) {
    return ARR_LEN[name] + 0
}'

2. 栈数据结构

# Named stacks
#
# State lives in two global tables:
#   STACK_TOP[name]       index of the top element, -1 when empty
#   STACK_DATA[name, i]   element stored at depth i
awk '
BEGIN {
    stack_init("mystack")
    stack_push("mystack", "item1")
    stack_push("mystack", "item2")
    stack_push("mystack", "item3")

    # Items come back in reverse order of insertion.
    while (!stack_empty("mystack")) {
        print "Popped:", stack_pop("mystack")
    }
}

# stack_init(name): create the stack called name, or empty it if it exists.
function stack_init(name,    k) {
    if (name in STACK_TOP)
        for (k = 0; k <= STACK_TOP[name]; k++)
            delete STACK_DATA[name, k]
    STACK_TOP[name] = -1
}

# stack_push(name, item): place item on top of the stack.
function stack_push(name, item) {
    if (!(name in STACK_TOP))
        STACK_TOP[name] = -1
    STACK_DATA[name, ++STACK_TOP[name]] = item
}

# stack_pop(name): remove and return the top item, or "" when the stack
# is empty or was never initialised.
function stack_pop(name,    item) {
    if (stack_empty(name))
        return ""
    item = STACK_DATA[name, STACK_TOP[name]]
    delete STACK_DATA[name, STACK_TOP[name]]
    STACK_TOP[name]--
    return item
}

# stack_empty(name): 1 when the stack holds no items, 0 otherwise.
function stack_empty(name) {
    return !(name in STACK_TOP) || STACK_TOP[name] < 0
}'

四、文件系统操作

1. 目录遍历

# Walk a directory tree recursively (find does the recursion)
awk 'BEGIN {
    traverse_directory(".")
}

# shell_quote(text): wrap text in single quotes for /bin/sh, escaping any
# embedded single quote. \047 is the octal escape for the single quote
# character, which cannot appear literally inside this quoted program.
function shell_quote(text) {
    gsub("\047", "\047\\\\\047\047", text)
    return "\047" text "\047"
}

# traverse_directory(dir): print every regular file found below dir.
# cmd and file are declared as extra parameters so they stay local.
function traverse_directory(dir,    cmd, file) {
    # Quote the path so spaces or shell metacharacters in it are not
    # interpreted by the shell.
    cmd = "find " shell_quote(dir) " -type f"
    while ((cmd | getline file) > 0) {
        print "Found file:", file
        # Process the file here...
    }
    close(cmd)
}'

2. 文件监控

# Poll a file once per second and report size changes
awk 'BEGIN {
    file = "monitor.txt"
    last_size = get_file_size(file)

    # Runs until interrupted.
    for (;;) {
        current_size = get_file_size(file)
        if (current_size != last_size) {
            print "File changed! New size:", current_size
            last_size = current_size
        }
        system("sleep 1")
    }
}

# get_file_size(filename): size in bytes as printed by stat -c %s,
# or 0 when stat produces no output (its errors are discarded).
function get_file_size(filename) {
    cmd = "stat -c %s " filename " 2>/dev/null"
    if ((cmd | getline size) <= 0) {
        close(cmd)
        return 0
    }
    close(cmd)
    return size
}'

五、数据库接口

1. SQLite 集成

# Talk to SQLite through the sqlite3 command-line shell
awk 'BEGIN {
    db_file = "data.db"

    # Create the table.
    system("sqlite3 " db_file " \"CREATE TABLE IF NOT EXISTS users(id INTEGER PRIMARY KEY, name TEXT, age INTEGER)\"")

    # Insert rows.
    insert_user(db_file, "Alice", 25)
    insert_user(db_file, "Bob", 30)

    # Query them back.
    query_users(db_file)
}

# sql_quote(text): render text as an SQL string literal. Single quotes
# inside the value are doubled. \047 is the octal escape for the single
# quote character, which cannot appear literally in this quoted program.
function sql_quote(text) {
    gsub("\047", "\047\047", text)
    return "\047" text "\047"
}

# insert_user(db, name, age): add one row to the users table.
# The statement is written to the standard input of sqlite3 instead of
# being placed on its command line, so no shell quoting is involved and
# the name reaches SQLite as a proper string literal.
function insert_user(db, name, age,    shell_cmd, sql) {
    # age + 0 forces a numeric value into the statement.
    sql = "INSERT INTO users(name, age) VALUES (" sql_quote(name) ", " (age + 0) ");"
    shell_cmd = "sqlite3 " db
    print sql | shell_cmd
    close(shell_cmd)   # wait for sqlite3 to finish before continuing
}

# query_users(db): print every row of the users table.
function query_users(db,    cmd) {
    cmd = "sqlite3 " db " \"SELECT * FROM users\""
    while ((cmd | getline) > 0) {
        print "User:", $0
    }
    close(cmd)
}'

2. CSV 到数据库转换

# Import a CSV file into the database
#
# All INSERT statements are streamed to one sqlite3 process through a pipe.
# Nothing passes through shell quoting, and no process is started per row.
awk -F',' '
BEGIN {
    sqlite = "sqlite3 data.db"
}

# Skip the header line; every other line becomes one INSERT.
NR > 1 {
    printf "INSERT INTO records(col1, col2, col3) VALUES (%s, %s, %s);\n", \
        sql_quote($1), sql_quote($2), sql_quote($3) | sqlite
}

END {
    # Closing the pipe lets sqlite3 see end-of-input and finish.
    close(sqlite)
}

# sql_quote(text): render text as an SQL string literal, doubling any
# embedded single quote (\047 is the octal escape for that character).
function sql_quote(text) {
    gsub("\047", "\047\047", text)
    return "\047" text "\047"
}
' data.csv

六、并发与并行处理

1. 多进程处理

# Process several files in parallel
awk 'BEGIN {
    files[1] = "file1.txt"
    files[2] = "file2.txt"
    files[3] = "file3.txt"
    file_count = 3

    # Build ONE shell command that starts every job in the background and
    # then waits for them. The shell builtin wait only sees children of the
    # shell it runs in, so a separate system("wait") would return at once
    # without waiting for jobs started by earlier system() calls.
    jobs = ""
    for (i = 1; i <= file_count; i++) {
        jobs = jobs "wc -l " files[i] " > result_" i ".txt & "
    }
    system(jobs "wait")

    # Collect the results in index order, then remove the temporary files.
    for (i = 1; i <= file_count; i++) {
        result_file = "result_" i ".txt"
        if ((getline < result_file) > 0) {
            # wc -l prints the line count first, then the file name.
            print "File", files[i], ":", $1, "lines"
        }
        close(result_file)
        system("rm result_" i ".txt")
    }
}'

2. 线程池模拟

# Simple task queue with a cap on concurrent workers
#
# Tasks run in batches of at most max_workers. Each batch is a single shell
# command of the form "t1 & t2 & t3 & wait", so the wait runs in the same
# shell that started the jobs and really blocks until the batch is done.
# A batch must finish completely before the next one starts.
awk 'BEGIN {
    max_workers = 3

    # The task list.
    tasks[1] = "task1.sh"
    tasks[2] = "task2.sh"
    tasks[3] = "task3.sh"
    tasks[4] = "task4.sh"
    tasks[5] = "task5.sh"
    task_count = 5

    batch = ""
    current_workers = 0

    for (i = 1; i <= task_count; i++) {
        # Add the task to the current batch.
        batch = batch tasks[i] " & "
        current_workers++
        print "Started task:", tasks[i], "(Workers:", current_workers, ")"

        # Launch the batch when it is full or when no tasks remain,
        # and block until every job in it has exited.
        if (current_workers == max_workers || i == task_count) {
            system(batch "wait")
            batch = ""
            current_workers = 0
        }
    }

    print "All tasks completed"
}'

七、高级正则表达式

1. 复杂模式匹配

# Multi-line matching in paragraph mode
awk '
BEGIN {
    # An empty RS makes blank lines the record separator, so each record
    # is a whole paragraph.
    RS = ""
}
{
    # Report only records that contain both ERROR and critical.
    if ($0 ~ /ERROR/ && $0 ~ /critical/) {
        print "Critical error found:"
        print $0
        print "---"
    }
}
' system.log

# Extract simple <tag>text</tag> pairs (requires gawk for 3-argument match)
awk '
BEGIN {
    # awk regular expressions have no backreferences, so the closing tag
    # name is captured as its own group and compared in code instead.
    tag_name = "[a-zA-Z][a-zA-Z0-9]*"
    pattern = "<(" tag_name ")[^>]*>([^<]*)</(" tag_name ")>"
}
{
    # Work on a copy so that $0 and the fields stay intact.
    rest = $0
    while (match(rest, pattern, arr)) {
        if (arr[1] == arr[3]) {
            # Opening and closing names agree: report the pair and
            # continue after it.
            print "Tag:", arr[1], "Content:", arr[2]
            rest = substr(rest, RSTART + RLENGTH)
        } else {
            # Mismatched pair: step past this opening bracket and retry.
            rest = substr(rest, RSTART + 1)
        }
    }
}
' html_file.txt

2. 正则捕获组

# Capture groups with the gawk 3-argument match()
# The rule fires on lines containing a YYYY-MM-DD date; the third argument
# receives the parenthesised groups in elements 1, 2 and 3.
awk '
match($0, /([0-9]{4})-([0-9]{2})-([0-9]{2})/, ymd) {
    print "Year:", ymd[1], "Month:", ymd[2], "Day:", ymd[3]
}' dates.txt

八、性能分析与优化

1. 性能监控

# Profiling run (gawk only)
# The profile file name must be attached to the option. With a space, as in
# "-p profile.out", gawk takes profile.out as the program text.
awk --profile=profile.out '
{
    # The actual processing.
    sum += $1
    count++
}
END {
    if (count > 0) {
        print "Average:", sum/count
    }
}
' large_data.txt

# Inspect profile.out afterwards; it lists the program annotated with execution counts.

2. 内存优化

# Streaming a large file with periodic progress output
awk '
BEGIN {
    # gawk variable: 3 requests binary mode for both reading and writing
    # on non-POSIX systems. It is not a buffer size.
    BINMODE = 3
}

# Every 10000 records, write a progress line to standard error so it
# does not mix with the normal output.
!(NR % 10000) {
    print "Processed", NR, "lines" > "/dev/stderr"
}

# The per-record processing would go in a rule here.
# ...

END {
    print "Total processed:", NR, "lines"
}' huge_file.txt

九、安全编程实践

1. 输入验证

# Sanitising and validating input
awk '
# safe_input(str): return str with the characters ; & | $ and backquote removed.
function safe_input(str) {
    gsub(/[;&|$`]/, "", str)
    return str
}

# validate_number(str): 1 when str consists only of decimal digits, else 0.
function validate_number(str) {
    if (str ~ /^[0-9]+$/)
        return 1
    return 0
}

{
    # Only the first field is examined, after sanitising.
    user_input = safe_input($1)

    if (!validate_number(user_input)) {
        print "Invalid input:", user_input > "/dev/stderr"
        next
    }
    print "Valid number:", user_input
}' input.txt

2. 沙盒模式

# Running in sandbox mode (gawk option -S / --sandbox)
#   gawk -S -f script.awk data.txt
# The sandbox disables system(), input redirection with getline, output
# redirection with print and printf, and loading of dynamic extensions.

awk -S '
BEGIN {
    # These would be rejected under the sandbox:
    # system("rm -rf /")        # fails
    # "/bin/sh" | getline       # fails as well
    print "Running in sandbox mode"
}
{
    # Plain reading and printing of the input files is still allowed.
    print
}' data.txt

十、高级实用示例

1. 实时日志监控系统

#!/usr/bin/awk -f
# Real-time log analyzer
#
# Counts lines containing ERROR or FATAL and prints a summary each time
# WINDOW_SIZE seconds of wall-clock time have passed. The check runs only
# when a new line arrives, so a quiet log delays the report.
# Uses systime() and strftime(), which are gawk extensions.
BEGIN {
    # Configuration
    ERROR_THRESHOLD = 10   # alert when a window holds more errors than this
    WINDOW_SIZE = 60       # reporting window, in seconds

    # Statistics for the current window. error_count starts at 0 so that
    # a window with no errors reports 0 and not an empty string.
    start_time = systime()
    error_count = 0
}

{
    # Record error lines for the current window.
    if ($0 ~ /ERROR|FATAL/) {
        error_count++
        errors[error_count] = $0
    }

    # Report once per window, then start a fresh one.
    current_time = systime()
    if (current_time - start_time >= WINDOW_SIZE) {
        report_stats()
        start_time = current_time
        delete errors  # discard the data of the finished window
        error_count = 0
    }
}

# report_stats(): print the error count of the finished window, and an
# alert line when it exceeds ERROR_THRESHOLD.
function report_stats() {
    print strftime("%Y-%m-%d %H:%M:%S"), "- Errors in last minute:", error_count
    if (error_count > ERROR_THRESHOLD) {
        print "ALERT: Error threshold exceeded!"
        # An alert mail or similar could be sent here.
    }
}

2. 数据可视化工具

# Simple text bar chart
# Reads one number per line (first field) and draws a bar of asterisks
# for each, scaled so that the largest value is 50 characters wide.
awk '
BEGIN {
    max_value = 0
}

{
    # Remember every value and track the largest one.
    data[NR] = $1
    if ($1 > max_value) max_value = $1
}

END {
    print "Data Visualization:"
    print "=================="

    # Scale to a width of 50 characters. With empty input, or when no
    # value is above zero, max_value is 0; use a scale of 0 then, because
    # dividing by it would abort the program.
    scale = (max_value > 0) ? 50 / max_value : 0

    for (i = 1; i <= NR; i++) {
        bar_length = int(data[i] * scale)
        printf "%3d |", data[i]
        for (j = 1; j <= bar_length; j++) {
            printf "*"
        }
        print ""
    }
}' numbers.txt

3. 配置管理器

# INI-style configuration parser with ${name} substitution
#
# Output: one "section/key = value" line per setting, in unspecified order.
awk '
BEGIN {
    # Keys that appear before any [section] header belong to this section.
    current_section = "global"
}

# Skip comment lines.
/^ *#/ { next }

# Skip blank lines.
/^ *$/ { next }

# Section header: [section]
/^\[.*\]$/ {
    sub(/^\[/, "")
    sub(/\]$/, "")
    current_section = $0
    next
}

# key = value
/^[a-zA-Z_]/ && /=/ {
    # Split at the FIRST equals sign only, so the value may itself
    # contain equals signs.
    eq_pos = index($0, "=")
    key = trim(substr($0, 1, eq_pos - 1))
    value = trim(expand_vars(substr($0, eq_pos + 1)))

    # Store the setting. var_map makes it available to later ${key}
    # references, regardless of section.
    config[current_section "/" key] = value
    var_map[key] = value
}

END {
    # Print every setting.
    for (key in config) {
        print key " = " config[key]
    }
}

# expand_vars(text): replace each ${name} in text with var_map[name].
# References to names not defined so far are left untouched. Substituted
# text is not scanned again, so a value cannot trigger endless expansion.
# match() is used because it sets RSTART and RLENGTH; gsub() does not.
function expand_vars(text,    out, name) {
    out = ""
    while (match(text, /\$\{[^}]+\}/)) {
        # Strip the leading "${" and the trailing "}" to get the name.
        name = substr(text, RSTART + 2, RLENGTH - 3)
        out = out substr(text, 1, RSTART - 1)
        if (name in var_map)
            out = out var_map[name]
        else
            out = out substr(text, RSTART, RLENGTH)
        text = substr(text, RSTART + RLENGTH)
    }
    return out text
}

# trim(str): remove leading and trailing spaces and tabs.
function trim(str) {
    gsub(/^[ \t]+|[ \t]+$/, "", str)
    return str
}' config.ini

这些高级应用展示了 AWK 在系统编程、网络通信、数据库集成等方面的强大能力。掌握这些技能后,你可以用 AWK 构建复杂的系统工具和自动化脚本。

此条目发表在linux文章分类目录,贴了标签。将固定链接加入收藏夹。

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注