AWK 高级应用手册
一、网络编程与套接字
1. TCP 客户端通信
# Minimal TCP client (requires gawk: /inet special files and the |& operator).
awk 'BEGIN {
    # Local port 0, remote end 127.0.0.1 port 8080.
    endpoint = "/inet/tcp/0/127.0.0.1/8080"

    # Send one line to the peer.
    print "Hello Server" |& endpoint

    # Read a single reply line; report it only when the read succeeded.
    got = (endpoint |& getline reply)
    if (got > 0)
        print "Server response:", reply

    close(endpoint)
}'
2. TCP 服务器
# Minimal TCP echo server on port 8080 (requires gawk: /inet files and |&).
awk 'BEGIN {
    # Local port 8080 with remote host 0 and remote port 0.
    listener = "/inet/tcp/8080/0/0"
    print "Server listening on port 8080..."

    for (;;) {
        # Read one line from the socket into $0; leave on EOF or error.
        if ((listener |& getline) <= 0)
            break

        print "Client connected"
        print "Received:", $0

        # Echo the line back, then close the socket; the next getline
        # on the same name uses it again.
        print "Echo: " $0 |& listener
        close(listener)
    }
}'
3. HTTP 客户端示例
# Minimal HTTP GET client (requires gawk: /inet special files and |&).
awk 'BEGIN {
    host = "www.example.com"
    port = 80
    socket = "/inet/tcp/0/" host "/" port

    # Build the request; the final CRLF CRLF is the blank line that ends
    # the header section.
    request = "GET / HTTP/1.1\r\n"
    request = request "Host: " host "\r\n"
    request = request "Connection: close\r\n\r\n"

    # Send with printf so that no extra ORS newline is appended after the
    # terminating blank line.
    printf "%s", request |& socket

    # Print the response line by line, dropping the CR of CRLF line endings.
    while ((socket |& getline line) > 0) {
        sub(/\r$/, "", line)
        print line
    }
    close(socket)
}'
二、进程间通信
1. 管道通信
# Exchange data with an external command over a two-way pipe
# (requires gawk for the |& operator).
awk 'BEGIN {
    sorter = "sort -n"

    # Feed the unsorted values to sort, one per line.
    print "30" |& sorter
    print "10" |& sorter
    print "20" |& sorter

    # Close only the write end so that sort sees end-of-file and can
    # start producing output.
    close(sorter, "to")

    # Collect the sorted lines.
    while ((sorter |& getline line) > 0)
        print "Sorted:", line

    close(sorter)
}'
2. 协同进程
# Coprocess example: upper-case a line through tr (requires gawk for |&).
awk 'BEGIN {
    coproc = "tr a-z A-Z"

    # Send the text to the coprocess.
    print "hello world" |& coproc

    # Close the write end before reading. tr buffers its output when it
    # writes to a pipe, so without end-of-file on its input the getline
    # below could wait forever.
    close(coproc, "to")

    # Read the converted line.
    if ((coproc |& getline result) > 0) {
        print "Result:", result
    }
    close(coproc)
}'
三、动态数据结构
1. 动态数组管理
# Named dynamic arrays (zero-based) built on composite-key associative arrays.
# Elements live in __array_data[name, index]; sizes in __array_len[name].
awk '
BEGIN {
    # Create the array.
    init_array("myarray")
    # Append elements.
    array_push("myarray", "first")
    array_push("myarray", "second")
    array_push("myarray", "third")
    # Fetch one element.
    print "Element at index 1:", array_get("myarray", 1)
    # Number of elements.
    print "Array length:", array_length("myarray")
    # Walk the array in index order.
    for (i = 0; i < array_length("myarray"); i++) {
        print "Index", i, ":", array_get("myarray", i)
    }
}
# Create the array called name, or empty it when it already exists.
function init_array(name,    k) {
    for (k = 0; k < __array_len[name]; k++)
        delete __array_data[name, k]
    __array_len[name] = 0
}
# Append value at the end of the array called name.
function array_push(name, value) {
    __array_data[name, __array_len[name] + 0] = value
    __array_len[name]++
}
# Return the element at idx, or "" when idx is out of range.
# (The parameter is not called index because that is a built-in function.)
function array_get(name, idx) {
    if ((name, idx) in __array_data)
        return __array_data[name, idx]
    return ""
}
# Return the number of elements; 0 for an array that was never created.
function array_length(name) {
    return __array_len[name] + 0
}'
2. 栈数据结构
# Named LIFO stacks built on composite-key associative arrays.
# Items live in __stack_items[name, position]; __stack_top[name] is the
# position of the top item, or -1 when the stack is empty.
awk '
BEGIN {
    stack_init("mystack")
    stack_push("mystack", "item1")
    stack_push("mystack", "item2")
    stack_push("mystack", "item3")
    while (!stack_empty("mystack")) {
        print "Popped:", stack_pop("mystack")
    }
}
# Create the stack called name, or empty it when it already exists.
function stack_init(name,    k) {
    if (name in __stack_top)
        for (k = 0; k <= __stack_top[name]; k++)
            delete __stack_items[name, k]
    __stack_top[name] = -1
}
# Push item on the stack; a stack that was never initialised starts empty.
function stack_push(name, item,    top) {
    top = (name in __stack_top) ? __stack_top[name] + 1 : 0
    __stack_items[name, top] = item
    __stack_top[name] = top
}
# Remove and return the top item, or return "" when the stack is empty.
function stack_pop(name,    top, item) {
    if (stack_empty(name))
        return ""
    top = __stack_top[name]
    item = __stack_items[name, top]
    delete __stack_items[name, top]
    __stack_top[name] = top - 1
    return item
}
# Return 1 when the stack holds no items (or was never created), else 0.
function stack_empty(name) {
    return !(name in __stack_top) || __stack_top[name] < 0
}'
四、文件系统操作
1. 目录遍历
# Recursively list the regular files below a directory by reading the
# output of find.
awk 'BEGIN {
    traverse_directory(".")
}
# Return s quoted as a single shell word: wrapped in single quotes, with
# every embedded single quote (octal 047) written as the 4-character
# close-quote, backslash-quote, reopen-quote sequence.
function shell_quote(s) {
    gsub(/\047/, "\047\\\\\047\047", s)
    return "\047" s "\047"
}
# Print every regular file found under dir.
# cmd and file are local so that the function does not clobber globals.
function traverse_directory(dir,    cmd, file) {
    # Quote the directory so that spaces or shell metacharacters in the
    # name cannot split or alter the command line.
    cmd = "find " shell_quote(dir) " -type f"
    while ((cmd | getline file) > 0) {
        print "Found file:", file
        # Process the file here ...
    }
    close(cmd)
}'
2. 文件监控
# Poll a file once per second and report whenever its size changes.
# Runs until interrupted.
awk 'BEGIN {
    file = "monitor.txt"
    previous = get_file_size(file)
    for (;;) {
        latest = get_file_size(file)
        if (latest != previous) {
            print "File changed! New size:", latest
            previous = latest
        }
        system("sleep 1")
    }
}
# Return the size reported by "stat -c %s" for filename, or 0 when stat
# prints nothing (its error output is discarded).
function get_file_size(filename,    ok) {
    cmd = "stat -c %s " filename " 2>/dev/null"
    ok = ((cmd | getline size) > 0)
    close(cmd)
    if (ok)
        return size
    return 0
}'
五、数据库接口
1. SQLite 集成
# Use SQLite from awk by driving the sqlite3 command-line shell.
# Statements are sent on the standard input of sqlite3, so the SQL text is
# never re-parsed by the shell and string values keep their quotes.
awk 'BEGIN {
    db_file = "data.db"
    # Create the table.
    run_sql(db_file, "CREATE TABLE IF NOT EXISTS users(id INTEGER PRIMARY KEY, name TEXT, age INTEGER);")
    # Insert rows.
    insert_user(db_file, "Alice", 25)
    insert_user(db_file, "Bob", 30)
    # Read them back.
    query_users(db_file)
}
# Return s as an SQL string literal: wrapped in single quotes (octal 047)
# with every embedded single quote doubled.
function sql_text(s) {
    gsub(/\047/, "\047\047", s)
    return "\047" s "\047"
}
# Execute one SQL statement against database db.
function run_sql(db, sql,    cmd) {
    cmd = "sqlite3 " db
    print sql | cmd
    # close() waits for sqlite3 to finish, which keeps statements ordered.
    close(cmd)
}
# Insert one user; age is forced to a number so it cannot carry SQL text.
function insert_user(db, name, age) {
    run_sql(db, "INSERT INTO users(name, age) VALUES (" sql_text(name) ", " (age + 0) ");")
}
# Print every row of the users table, one per line, prefixed with "User:".
function query_users(db,    cmd, row) {
    cmd = "sqlite3 " db " \"SELECT * FROM users\""
    while ((cmd | getline row) > 0) {
        print "User:", row
    }
    close(cmd)
}'
2. CSV 到数据库转换
# Import the first three columns of data.csv into the records table of
# data.db. The header line is skipped. Fields are split on every comma, so
# quoted CSV fields that contain commas are not supported.
awk -F',' '
# Return s as an SQL string literal: wrapped in single quotes (octal 047)
# with every embedded single quote doubled.
function sql_text(s) {
    gsub(/\047/, "\047\047", s)
    return "\047" s "\047"
}
BEGIN {
    # One sqlite3 process receives all statements on its standard input,
    # instead of one process per row; a single transaction wraps the rows.
    loader = "sqlite3 data.db"
    print "BEGIN;" | loader
}
NR > 1 {
    print "INSERT INTO records(col1, col2, col3) VALUES (" sql_text($1) ", " sql_text($2) ", " sql_text($3) ");" | loader
}
END {
    print "COMMIT;" | loader
    close(loader)
}
' data.csv
六、并发与并行处理
1. 多进程处理
# Count the lines of several files in parallel.
awk 'BEGIN {
    files[1] = "file1.txt"
    files[2] = "file2.txt"
    files[3] = "file3.txt"
    file_count = 3

    # Start every job and wait for all of them inside ONE shell. Each
    # system() call runs its own shell, so a separate system("wait") would
    # have no background jobs to wait for and would return at once.
    jobs = ""
    for (i = 1; i <= file_count; i++)
        jobs = jobs "wc -l " files[i] " > result_" i ".txt & "
    system(jobs "wait")

    # Collect the results in a fixed order, then remove the result files.
    for (i = 1; i <= file_count; i++) {
        result = "result_" i ".txt"
        if ((getline line < result) > 0) {
            # wc prints the count first, followed by the file name.
            split(line, parts, " ")
            print "File", files[i], ":", parts[1], "lines"
        }
        close(result)
        system("rm -f " result)
    }
}'
2. 线程池模拟
# Simple task queue: run the tasks in batches of at most max_workers.
# This is batch scheduling, not a true pool: a new batch starts only after
# every task of the previous batch has finished.
awk 'BEGIN {
    max_workers = 3
    current_workers = 0

    # Task list, run in this order.
    tasks[1] = "task1.sh"
    tasks[2] = "task2.sh"
    tasks[3] = "task3.sh"
    tasks[4] = "task4.sh"
    tasks[5] = "task5.sh"
    task_count = 5

    batch = ""
    for (i = 1; i <= task_count; i++) {
        # Queue the task as a background job of the current batch.
        batch = batch tasks[i] " & "
        current_workers++
        print "Started task:", tasks[i], "(Workers:", current_workers, ")"

        # When the batch is full, or the list is exhausted, launch it and
        # wait in the SAME shell; a wait issued from a separate system()
        # call would have no jobs to wait for.
        if (current_workers == max_workers || i == task_count) {
            system(batch "wait")
            batch = ""
            current_workers = 0
        }
    }
    print "All tasks completed"
}'
七、高级正则表达式
1. 复杂模式匹配
# Multi-line matching: treat each blank-line-separated paragraph of
# system.log as one record and print those that mention both ERROR and
# critical.
awk '
BEGIN { RS = "" }   # empty RS selects paragraph mode
{
    # Skip paragraphs that lack either keyword.
    if ($0 !~ /ERROR/ || $0 !~ /critical/)
        next

    print "Critical error found:"
    print $0
    print "---"
}
' system.log
# Extract simple <tag>text</tag> elements from each line
# (requires gawk for the three-argument form of match()).
awk '
BEGIN {
    # awk regular expressions have no backreferences, so the closing tag
    # name is captured separately and compared with the opening one.
    # Groups: 1 = opening tag name, 2 = attributes, 3 = text,
    #         4 = closing tag name.
    pattern = "<([a-zA-Z][a-zA-Z0-9]*)([ \t][^>]*)?>([^<]*)</([a-zA-Z][a-zA-Z0-9]*)[ \t]*>"
}
{
    rest = $0
    while (match(rest, pattern, arr)) {
        # Report the element only when both tag names agree.
        if (arr[1] == arr[4]) {
            tag = arr[1]
            content = arr[3]
            print "Tag:", tag, "Content:", content
        }
        # Continue after the text that was just examined.
        rest = substr(rest, RSTART + RLENGTH)
    }
}
' html_file.txt
2. 正则捕获组
# Regexp capture groups with gawk: the third argument of match() receives
# the text of each parenthesized group.
awk '
# The rule fires only for lines that contain a YYYY-MM-DD date.
match($0, /([0-9]{4})-([0-9]{2})-([0-9]{2})/, date_parts) {
    print "Year:", date_parts[1], "Month:", date_parts[2], "Day:", date_parts[3]
}' dates.txt
八、性能分析与优化
1. 性能监控
# Profile a program with gawk and write the report to profile.out.
# The file name has to be attached to the option (--profile=FILE or -pFILE).
# Written as "-p profile.out", gawk takes profile.out as the program text.
awk --profile=profile.out '
{
    # Actual processing: accumulate the first field.
    sum += $1
    count++
}
END {
    if (count > 0) {
        print "Average:", sum/count
    }
}
' large_data.txt
# Inspect profile.out afterwards to see the execution counts.
2. 内存优化
# Skeleton for processing a very large file record by record, with a
# progress message on stderr every 10000 records.
awk '
BEGIN {
    # BINMODE = 3 asks gawk for binary mode on both reads and writes.
    # It is not a buffer size; per the gawk manual it matters only on
    # non-POSIX systems.
    BINMODE = 3
}
# Progress heartbeat.
NR % 10000 == 0 {
    print "Processed", NR, "lines" > "/dev/stderr"
}
{
    # Per-record processing goes here; nothing is accumulated, so memory
    # use does not grow with the file size.
    # ...
}
END {
    print "Total processed:", NR, "lines"
}' huge_file.txt
九、安全编程实践
1. 输入验证
# Sanitize and validate the first field of every line of input.txt.
awk '
# Return str with the characters ; & | $ and backquote removed.
function safe_input(str) {
    gsub(/[;&|$`]/, "", str)
    return str
}
# Return 1 when str consists only of decimal digits, otherwise 0.
function validate_number(str) {
    if (str ~ /^[0-9]+$/)
        return 1
    return 0
}
{
    cleaned = safe_input($1)
    # Rejected values go to stderr, accepted ones to stdout.
    if (!validate_number(cleaned)) {
        print "Invalid input:", cleaned > "/dev/stderr"
        next
    }
    print "Valid number:", cleaned
}' input.txt
2. 沙盒模式
# Run a program in gawk sandbox mode (-S, long form --sandbox).
# With a script file: gawk -S -f script.awk data.txt
# Per the gawk manual, sandbox mode disables system(), input redirection
# with getline, output redirection with print/printf, and extensions.
awk -S '
BEGIN {
    # Calls such as the following are rejected in sandbox mode:
    # system("rm -rf /")
    # "/bin/sh" | getline line
    print "Running in sandbox mode"
}
# Copy every input record to the output unchanged.
{ print }' data.txt
十、高级实用示例
1. 实时日志监控系统
#!/usr/bin/awk -f
# Real-time log analyzer: counts lines that contain ERROR or FATAL and
# prints the count once per window. Windows are measured with the wall
# clock (systime()) at the moment each line is read, not with timestamps
# taken from the log.
BEGIN {
    # Configuration
    ERROR_THRESHOLD = 10   # alert when one window holds more errors than this
    WINDOW_SIZE = 60       # window length in seconds

    # Start the counter at 0 so that a window without errors reports 0
    # rather than an empty string.
    error_count = 0
    start_time = systime()
}
{
    # Remember every error line of the current window.
    if ($0 ~ /ERROR|FATAL/) {
        error_count++
        errors[error_count] = $0
    }

    # Report and start a new window once WINDOW_SIZE seconds have passed.
    current_time = systime()
    if (current_time - start_time >= WINDOW_SIZE) {
        report_stats()
        start_time = current_time
        delete errors      # drop the lines of the finished window
        error_count = 0
    }
}
END {
    # Report the last, possibly partial, window so that errors seen after
    # the final periodic report are not lost.
    report_stats()
}
# Print the error count of the current window, plus an alert line when the
# count exceeds ERROR_THRESHOLD.
function report_stats() {
    print strftime("%Y-%m-%d %H:%M:%S"), "- Errors in last minute:", error_count
    if (error_count > ERROR_THRESHOLD) {
        print "ALERT: Error threshold exceeded!"
        # Hook for notifications, for example sending a mail.
    }
}
2. 数据可视化工具
# Simple text bar chart: one bar per input line, scaled so that the
# largest value is 50 characters wide.
awk '
BEGIN {
    max_value = 0
}
{
    data[NR] = $1
    if ($1 > max_value) max_value = $1
}
END {
    print "Data Visualization:"
    print "=================="
    # With no input, or when no value is above zero, there is nothing to
    # scale against; use 0 instead of dividing by zero.
    scale = (max_value > 0) ? 50 / max_value : 0
    for (i = 1; i <= NR; i++) {
        bar_length = int(data[i] * scale)
        printf "%3d |", data[i]
        for (j = 1; j <= bar_length; j++) {
            printf "*"
        }
        print ""
    }
}' numbers.txt
3. 配置管理器
# Configuration file parser: [section] headers, key = value pairs, and
# ${name} references to keys defined earlier in the file.
# Output: one "section/key = value" line per setting, in no particular order.
awk '
BEGIN {
    # Keys that appear before any section header belong to this section.
    current_section = "global"
}
# Skip comment lines.
/^ *#/ { next }
# Skip blank lines.
/^ *$/ { next }
# Section header: [section]
/^\[.*\]$/ {
    sub(/^\[/, "")
    sub(/\]$/, "")
    current_section = $0
    next
}
# key = value
/^[a-zA-Z_]/ && /=/ {
    # Split at the FIRST equals sign so that values may contain more.
    eq = index($0, "=")
    key = trim(substr($0, 1, eq - 1))
    value = expand_vars(trim(substr($0, eq + 1)))
    # Store the setting and make it available to later references.
    config[current_section "/" key] = value
    var_map[key] = value
}
END {
    for (key in config) {
        print key " = " config[key]
    }
}
# Return text with every ${name} replaced by var_map[name]; unknown names
# expand to the empty string. match() is used because it sets RSTART and
# RLENGTH, which gsub() does not. Inserted values are not scanned again,
# so the loop always terminates.
function expand_vars(text,    name, out) {
    out = ""
    while (match(text, /[$][{][^}]+[}]/)) {
        name = substr(text, RSTART + 2, RLENGTH - 3)
        out = out substr(text, 1, RSTART - 1) var_map[name]
        text = substr(text, RSTART + RLENGTH)
    }
    return out text
}
# Return str without leading and trailing spaces and tabs.
function trim(str) {
    gsub(/^[ \t]+|[ \t]+$/, "", str)
    return str
}' config.ini
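这些高级应用展示了 AWK 在系统编程、网络通信、数据库集成等方面的强大能力。需要注意的是,其中的网络套接字(/inet)、协同进程(|&)、match() 的第三个参数、性能分析和沙盒模式等功能都是 GNU awk(gawk)的扩展,在 mawk 或 BusyBox awk 中不可用。掌握这些技能后,你可以用 AWK 构建复杂的系统工具和自动化脚本。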