1. 环境初始化


2. eBPF 书签


3. 推荐项目


4. 示例 1 - tcpstates.bt

#!/usr/bin/env bpftrace
#include <net/tcp_states.h>
#include <net/sock.h>
#include <linux/socket.h>
#include <linux/tcp.h>

// Runs once at start-up: prints the column header and fills @tcpstate, a
// lookup table from the numeric TCP state (0..12) to its printable name.
// The kprobe below indexes this table to render old/new states as text.
BEGIN
{
    printf("%-20s %-7s %-20s %-7s ",
        "LADDR", "LPORT", "RADDR", "RPORT");
    printf("%-11s -> %-11s\n", "OLD", "NEW");
    @tcpstate[0] = "UNKNOWN";
    @tcpstate[1] = "ESTABLISHED";
    @tcpstate[2] = "SYN_SENT";
    @tcpstate[3] = "SYN_RECV";
    @tcpstate[4] = "FIN_WAIT1";
    @tcpstate[5] = "FIN_WAIT2";
    @tcpstate[6] = "TIME_WAIT";
    @tcpstate[7] = "CLOSE";
    @tcpstate[8] = "CLOSE_WAIT";
    @tcpstate[9] = "LAST_ACK";
    @tcpstate[10] = "LISTEN";
    @tcpstate[11] = "CLOSING";
    @tcpstate[12] = "NEW_SYN_RECV";
}

// Fires on entry to tcp_set_state(sk, state) and prints one line per call:
// local address/port, remote address/port, and the old -> new TCP state.
kprobe:tcp_set_state
{
    // arg0 is the socket, arg1 the state being set; the socket still holds
    // the previous state in skc_state when the probe runs at function entry.
    $sk = (struct sock *)arg0;
    $newstate = arg1;
    $oldstate = $sk->__sk_common.skc_state;

    $lport = $sk->__sk_common.skc_num;
    $dport = $sk->__sk_common.skc_dport;
    // Swap the two bytes of the 16-bit destination port before printing.
    $dport = ($dport >> 8) | (($dport << 8) & 0xff00);

    $family = $sk->__sk_common.skc_family;
    // Both address variables are assigned before the branches below.
    // NOTE(review): presumably so they have a type/value on every path — confirm.
    $saddr = ntop(0);
    $daddr = ntop(0);
    if ($family == AF_INET) {
        $saddr = ntop(AF_INET, $sk->__sk_common.skc_rcv_saddr);
        $daddr = ntop(AF_INET, $sk->__sk_common.skc_daddr);
    } else {
        // AF_INET6
        // Every family other than AF_INET is formatted as IPv6 here.
        $saddr = ntop(AF_INET6,
            $sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr8);
        $daddr = ntop(AF_INET6,
            $sk->__sk_common.skc_v6_daddr.in6_u.u6_addr8);
    }
    // @tcpstate only has entries for 0..12; a larger new state is printed
    // numerically instead of being looked up in the table.
    if ($newstate > 12) {
        printf("%-20s %-7d %-20s %-7d %-11d -> %-11d\n", $saddr, $lport, $daddr, $dport, $oldstate, $newstate);
    } else {
        printf("%-20s %-7d %-20s %-7d %-11s -> %-11s\n", $saddr, $lport, $daddr, $dport, @tcpstate[$oldstate], @tcpstate[$newstate]);
    }
}

// Runs once when tracing stops: clears the @tcpstate lookup table.
// NOTE(review): presumably done so bpftrace does not print the table on
// exit — confirm against the bpftrace reference.
END
{
    clear(@tcpstate)
}

5. 示例 2 - 常用函数原型

tcp_recvmsg:

/*
 * Prototype of tcp_recvmsg as used for argument lookup when probing it:
 * socket, message header, length, nonblock flag, flags, address-length out.
 * NOTE(review): kernel prototypes change between releases — confirm this
 * matches the target kernel before reading arguments by position.
 */
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
                int flags, int *addr_len);

tcp_sendmsg:

/*
 * Prototype of tcp_sendmsg: socket, message header, and size.
 * NOTE(review): confirm against the target kernel's headers.
 */
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);

tcp_set_state:

/*
 * Prototype of tcp_set_state, the function probed by the examples in this
 * document: the first argument is the socket and the second the state being
 * set (read as arg0/arg1 in the bpftrace script, sk/new_state in the eBPF C
 * program).
 */
void tcp_set_state(struct sock *sk, int state);

6. 示例 3 - Go cilium/ebpf 尾调用示例

视频地址:https://www.bilibili.com/video/BV1zy411h72B/

6.1. go.mod

module ebpftest

go 1.21.10

require github.com/cilium/ebpf v0.15.0

require (
    golang.org/x/exp v0.0.0-20230224173230-c95f2b4c22f2 // indirect
    golang.org/x/sys v0.15.0 // indirect
)

6.2. Makefile

# Build helper for the tail-call example.
# NOTE(review): make requires recipe lines to start with a TAB; if this file
# is copied from a document, confirm the indentation below is a real tab.
.PHONY: build run clean

# build: run bpf2go on kernel/tail_call_test.bpf.c (identifier stem "Bpf",
# Go package "ebpf", output under ./ebpf, extra include path ./kernel),
# then tidy the module and build the ./ebpftest binary.
build:
    bpf2go -target amd64 --go-package ebpf --output-dir ebpf Bpf kernel/tail_call_test.bpf.c -- -I./kernel
    go mod tidy
    go build -o ebpftest -tags amd64

# run: rebuild, then start the tracer.
run: build
    ./ebpftest

# clean: remove the generated files under ./ebpf and the built binary.
clean:
    rm -rf ebpf/* ./ebpftest

6.3. main.go

package main

import (
    "bytes"
    "ebpftest/ebpf"
    "encoding/binary"
    "github.com/cilium/ebpf/link"
    "github.com/cilium/ebpf/perf"
    "github.com/cilium/ebpf/rlimit"
    "log"
    "net"
    "unsafe"
)

// GetByteOrder 获取本机字节序。
// 小端模式:数据的低位存储在内存的低地址上;
// 大端模式:数据的低位存储在内存的高地址上
func GetByteOrder() binary.ByteOrder {
    n := 0x1234
    f := *((*byte)(unsafe.Pointer(&n)))
    if (f ^ 0x34) == 0 {
        return binary.LittleEndian
    }
    return binary.BigEndian
}

// Uint32ToIP converts an IPv4 address held in a uint32 into a 4-byte net.IP.
// The integer is written out using the host byte order, so the resulting
// bytes are the same as the in-memory representation of ip.
func Uint32ToIP(ip uint32) net.IP {
    var raw [4]byte
    GetByteOrder().PutUint32(raw[:], ip)
    return net.IP(raw[:])
}

// Uint32ArrayToIP converts an IPv6 address held as four uint32 words into a
// 16-byte net.IP. Each word is written out in host byte order, in array order.
func Uint32ArrayToIP(ip [4]uint32) net.IP {
    const wordSize = 4
    order := GetByteOrder()
    out := make([]byte, wordSize*len(ip))
    for idx, word := range ip {
        order.PutUint32(out[idx*wordSize:], word)
    }
    return out
}

// main loads the compiled eBPF objects, registers the tail-call target,
// attaches the tcp_set_state kprobe and then logs every socket event that
// arrives through the perf event array until the process is terminated.
func main() {
    // Address-family value the kernel program stores in BpfSockMeta.Af for
    // IPv4 sockets. The kernel side only emits AF_INET and AF_INET6 events,
    // so any other value seen here is treated as IPv6.
    const afInet = 2

    // Allow the current process to lock memory for eBPF resources.
    if err := rlimit.RemoveMemlock(); err != nil {
        log.Fatal(err)
    }

    // Load pre-compiled programs and maps into the kernel.
    objs := ebpf.BpfObjects{}
    if err := ebpf.LoadBpfObjects(&objs, nil); err != nil {
        log.Fatalf("loading objects: %v", err)
    }
    defer objs.Close()

    // Populate slot 0 of the program array BEFORE attaching the kprobe, so
    // the very first tcp_set_state call already finds its tail-call target.
    err := objs.BpfMaps.TailJmpMap.Put(uint32(0), objs.BpfPrograms.OutputSocketMeta)
    if err != nil {
        log.Fatalf("putting tail jmp map: %v", err)
    }

    kpEnterTcpSetState, err := link.Kprobe("tcp_set_state", objs.EnterTcpSetState, nil)
    if err != nil {
        log.Fatalf("opening kprobe tcp_set_state: %v", err)
    }
    defer kpEnterTcpSetState.Close()

    log.Println("waiting for events...")

    // 16777216B = 16MB
    reader, err := perf.NewReader(objs.BpfMaps.Events, 16777216)
    if err != nil {
        log.Fatalf("creating perf reader: %v", err)
    }
    defer reader.Close()

    for {
        record, err := reader.Read()
        if err != nil {
            log.Printf("reading perf: %v\n", err)
            continue
        }
        // A record that only reports dropped samples carries no payload;
        // decoding it would fail and take the whole tracer down.
        if record.LostSamples != 0 {
            log.Printf("perf buffer full: lost %d samples", record.LostSamples)
            continue
        }
        var data ebpf.BpfSockMeta
        if err := binary.Read(bytes.NewReader(record.RawSample), GetByteOrder(), &data); err != nil {
            log.Fatalf("unmarshaling binary: %v\n", err)
        }
        if data.Af == afInet {
            // IPv4 addresses are stored in the last word of the 4-word array.
            log.Printf("%s:%d -> %s:%d", Uint32ToIP(data.LocalIp[3]), data.LocalPort,
                Uint32ToIP(data.RemoteIp[3]), data.RemotePort)
        } else {
            log.Printf("%s:%d -> %s:%d", Uint32ArrayToIP(data.LocalIp), data.LocalPort,
                Uint32ArrayToIP(data.RemoteIp), data.RemotePort)
        }
    }
}

6.4. kernel/tail_call_test.bpf.c

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_endian.h>
#include "tail_call_test.h"

char LICENSE[] SEC("license") = "GPL";

// Perf event array with MAX_CPU entries. output_socket_meta writes
// struct sock_meta samples into it with bpf_perf_event_output(), and the
// user-space loader reads them back through a perf reader.
struct {
    __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
    __uint(max_entries, MAX_CPU);
    __type(key, int);
    __type(value, __u32);
} events SEC(".maps");

// Hash map used to hand a struct sock_meta from enter_tcp_set_state to the
// tail-called output_socket_meta program. The key is the __u64 returned by
// bpf_get_current_pid_tgid(), which both programs compute independently.
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, MAP_MAX_ENTRIES);
    __type(key, __u64);
    __type(value, struct sock_meta);
} sock_meta_map SEC(".maps");

// Program array used as the jump table for bpf_tail_call(). User space stores
// the output_socket_meta program at index 0, which is the index
// enter_tcp_set_state jumps to.
struct {
    __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
    __uint(key_size, sizeof(__u32));
    __uint(value_size, sizeof(__u32));
    __uint(max_entries, MAP_MAX_ENTRIES);
} tail_jmp_map SEC(".maps");

// Fill *meta with the address family, addresses, ports and current TCP state
// of sk, and return the value of bpf_get_current_pid_tgid() for the caller to
// use as a map key. Both old_state and new_state are set to the state the
// socket currently holds; the caller overrides new_state as needed.
static __always_inline __u64 generate_socket_data_meta(struct sock_meta *meta, struct sock *sk) {
    meta->af = BPF_CORE_READ(sk, __sk_common.skc_family);

    switch (meta->af) {
    case AF_INET:
        // An IPv4 address occupies only the last word of the 4-word array.
        meta->local_ip[3] = BPF_CORE_READ(sk, __sk_common.skc_rcv_saddr);
        meta->remote_ip[3] = BPF_CORE_READ(sk, __sk_common.skc_daddr);
        break;
    case AF_INET6:
        // An IPv6 address fills all four words.
        BPF_CORE_READ_INTO(meta->local_ip, sk, __sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
        BPF_CORE_READ_INTO(meta->remote_ip, sk, __sk_common.skc_v6_daddr.in6_u.u6_addr32);
        break;
    default:
        // Other families: leave the address fields untouched.
        break;
    }

    // The local port is stored as read; the remote port goes through bpf_ntohs().
    meta->local_port = BPF_CORE_READ(sk, __sk_common.skc_num);
    meta->remote_port = bpf_ntohs(BPF_CORE_READ(sk, __sk_common.skc_dport));

    // Record the socket's current state in both state fields.
    __s32 current_state = BPF_CORE_READ(sk, __sk_common.skc_state);
    meta->new_state = current_state;
    meta->old_state = current_state;

    return bpf_get_current_pid_tgid();
}

// Tail-call target: looks up the struct sock_meta that the calling program
// stored in sock_meta_map under the current pid/tgid, emits it to user space
// through the events perf array, and then removes the map entry.
// Always returns 0.
SEC("kprobe/output_socket_meta")
int BPF_KPROBE(output_socket_meta) {
    __u64 id = bpf_get_current_pid_tgid();
    struct sock_meta *meta = bpf_map_lookup_elem(&sock_meta_map, &id);
    if (!meta) {
        // Nothing was staged for this task; nothing to emit.
        return 0;
    }
    bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, meta, sizeof(*meta));
    // A successful bpf_tail_call() never returns to the program that issued
    // it, so any cleanup placed after the call there is skipped. Drop the
    // staged entry here, otherwise entries pile up in sock_meta_map until it
    // is full and later inserts (and therefore events) are silently lost.
    bpf_map_delete_elem(&sock_meta_map, &id);
    return 0;
}

// kprobe on tcp_set_state(sk, new_state). Collects the socket metadata and,
// when the socket is moving to TCP_ESTABLISHED, stages it in sock_meta_map
// and tail-calls slot 0 of tail_jmp_map to emit it. Sockets that are neither
// AF_INET nor AF_INET6 are ignored. Always returns 0.
SEC("kprobe/tcp_set_state")
int BPF_KPROBE(enter_tcp_set_state, struct sock *sk, int new_state) {
    struct sock_meta meta = {};
    __u64 id = generate_socket_data_meta(&meta, sk);
    meta.new_state = new_state;

    int is_ip_socket = (meta.af == AF_INET) || (meta.af == AF_INET6);
    if (!is_ip_socket) {
        return 0;
    }

    if (meta.new_state != TCP_ESTABLISHED) {
        // Not a transition we report: just drop any entry staged under this key.
        bpf_map_delete_elem(&sock_meta_map, &id);
        return 0;
    }

    // Stage the metadata (flags value 0) and jump to the output program.
    bpf_map_update_elem(&sock_meta_map, &id, &meta, 0);
    bpf_tail_call(ctx, &tail_jmp_map, 0);

    // Only reached when the tail call did not take place: remove the entry
    // that was just staged.
    bpf_map_delete_elem(&sock_meta_map, &id);
    return 0;
}

6.5. kernel/tail_call_test.h

/* Shared definitions for the tail-call example: map sizes, address-family
 * constants and the event record passed from the eBPF programs to user space. */
#ifndef __TAIL_CALL_TEST_H
#define __TAIL_CALL_TEST_H

/* max_entries of the perf event array. */
#define MAX_CPU 256
/* max_entries of the hash map and of the program array. */
#define MAP_MAX_ENTRIES 1024
/* Address-family values compared against the socket's skc_family. */
#define AF_INET 2
#define AF_INET6 10

/*
 * One socket event. Filled by generate_socket_data_meta() and emitted to
 * user space as a raw perf sample.
 * NOTE(review): each __u16 port is followed by a 4-byte-aligned member, so
 * the compiler presumably inserts 2 bytes of padding after each port —
 * confirm that the user-space decoder uses the same layout.
 */
struct sock_meta {
    __s32 af;            /* address family read from skc_family */
    __u32 local_ip[4];   /* local address; IPv4 uses only index 3 */
    __u16 local_port;    /* local port, from skc_num */
    __u32 remote_ip[4];  /* remote address; IPv4 uses only index 3 */
    __u16 remote_port;   /* remote port, skc_dport passed through bpf_ntohs() */
    __s32 old_state;     /* socket state read from skc_state when the probe ran */
    __s32 new_state;     /* state argument passed to tcp_set_state */
};

#endif /* __TAIL_CALL_TEST_H */