netfilter连接跟踪统计

本文主要是介绍netfilter连接跟踪统计，希望对大家解决编程问题提供一定的参考价值，需要的开发者们随着小编来一起学习吧！

连接跟踪统计模块初始化函数nf_conncount_modinit如下，首先初始化256（CONNCOUNT_SLOTS）个自旋锁，保护对应的256个槽位，每个槽位对应一个红黑树，每个树节点为结构（nf_conncount_rb），对应于键值；而树节点结构中的链表成员，包含所有匹配键值的连接跟踪（nf_conncount_tuple），以及数量统计。

之后，函数分配两个内核缓存，用于加速以上两个结构体的分配。

static int __init nf_conncount_modinit(void)
{for (i = 0; i < CONNCOUNT_SLOTS; ++i)spin_lock_init(&nf_conncount_locks[i]);conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple",sizeof(struct nf_conncount_tuple),0, 0, NULL);if (!conncount_conn_cachep)return -ENOMEM;conncount_rb_cachep = kmem_cache_create("nf_conncount_rb",sizeof(struct nf_conncount_rb),0, 0, NULL);

需要使用连接跟踪数量统计模块的话，首先进行如下的初始化（nf_conncount_init），提供数量统计所使用的键值长度。键值长度keylen要求是4字节的整数倍且不能为0，最大不能超过20字节（MAX_KEYLEN=5，即5个u32）。

xt_connlimit需要使用连接跟踪数量统计，其将IP地址作为键值。

struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,unsigned int keylen)
{struct nf_conncount_data *data;if (keylen % sizeof(u32) ||keylen / sizeof(u32) > MAX_KEYLEN ||keylen == 0)return ERR_PTR(-EINVAL);net_get_random_once(&conncount_rnd, sizeof(conncount_rnd));data = kmalloc(sizeof(*data), GFP_KERNEL);if (!data)return ERR_PTR(-ENOMEM);

当前功能需要连接跟踪模块nf_conntrack，和IPv4/IPv6重组模块nf_defrag_ipv4/v6的支持。

    ret = nf_ct_netns_get(net, family);if (ret < 0) {kfree(data);return ERR_PTR(ret);}

接下来初始化红黑树根节点，以及启动回收worker。

    for (i = 0; i < ARRAY_SIZE(data->root); ++i)data->root[i] = RB_ROOT;data->keylen = keylen / sizeof(u32);data->net = net;INIT_WORK(&data->gc_work, tree_gc_worker);return data;

增加连接跟踪计数

获取指定命名空间中的键值key对应的连接跟踪数量，如果tuple参数不为空，为其创建统计结构。

/* Count and return number of conntrack entries in 'net' with particular 'key'.
 * If 'tuple' is not null, insert it into the accounting data structure.
 * Call with RCU read lock.
 *
 * This is a thin wrapper: all arguments are forwarded unchanged to
 * count_tree() and its result is returned as-is.
 */
unsigned int nf_conncount_count(struct net *net,struct nf_conncount_data *data,const u32 *key,const struct nf_conntrack_tuple *tuple,const struct nf_conntrack_zone *zone)
{return count_tree(net, data, key, tuple, zone);
}

首先根据键值以及初始化的随机值，确定哈希值，在对应槽位找到红黑树的根。其中，键值较小的项保存在树的左侧；相反，键值较大的项保存在树的右侧。

static unsigned int
count_tree(struct net *net, struct nf_conncount_data *data,const u32 *key, const struct nf_conntrack_tuple *tuple,const struct nf_conntrack_zone *zone)
{struct rb_root *root;struct rb_node *parent;struct nf_conncount_rb *rbconn;u8 keylen = data->keylen;hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;root = &data->root[hash];parent = rcu_dereference_raw(root->rb_node);while (parent) {int diff;rbconn = rb_entry(parent, struct nf_conncount_rb, node);diff = key_diff(key, rbconn->key, keylen);if (diff < 0) {parent = rcu_dereference_raw(parent->rb_left);} else if (diff > 0) {parent = rcu_dereference_raw(parent->rb_right);} else {

如果键值相等，在参数tuple为空的情况下，返回此项对应的统计值。在此之前，针对此项中的链表，执行回收操作（nf_conncount_gc_list）。

            if (!tuple) {nf_conncount_gc_list(net, &rbconn->list);return rbconn->list.count;}

反之，如果参数tuple有值，尝试将其添加到红黑树中。如果此项中连接跟踪的数量为零，表明其可能要被删除，使用insert_tree执行插入操作。否则，直接将tuple添加到此节点的链表中。

链表中的元素个数表示的即为键值key（如IP地址）对应的连接跟踪数量。

            spin_lock_bh(&rbconn->list.list_lock);/* Node might be about to be free'd.* We need to defer to insert_tree() in this case.*/if (rbconn->list.count == 0) {spin_unlock_bh(&rbconn->list.list_lock);break;}/* same source network -> be counted! */ret = __nf_conncount_add(net, &rbconn->list, tuple, zone);spin_unlock_bh(&rbconn->list.list_lock);if (ret)return 0; /* hotdrop */elsereturn rbconn->list.count;}}if (!tuple) return 0;return insert_tree(net, data, root, hash, key, tuple, zone);

链表元素新增

根据tuple查找对应的连接跟踪元组哈希结构（nf_conntrack_tuple_hash），之后，可据此找到对应的连接跟踪。

static const struct nf_conntrack_tuple_hash *
find_or_evict(struct net *net, struct nf_conncount_list *list,struct nf_conncount_tuple *conn)
{const struct nf_conntrack_tuple_hash *found;unsigned long a, b;int cpu = raw_smp_processor_id();u32 age;found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple);if (found)return found;

反之，如果没有找到，此表项可能刚刚被其它的处理器添加进来，还没有被确认，这种情况下以上函数nf_conntrack_find_get返回空。接下来确定当前的表项（conn）是否需要移除，条件是：1）此表项就是被当前处理器所添加；或者2）此表项的存在时长大于等于2个jiffies（还没有被确认）。符合以上任意条件，则认为当前表项已经无用，将其释放。

否则，返回重试错误（EAGAIN），再次重试时，表项可能已经被确认。

    b = conn->jiffies32;a = (u32)jiffies;/* conn might have been added just before by another cpu and* might still be unconfirmed.  In this case, nf_conntrack_find()* returns no result.  Thus only evict if this cpu added the* stale entry or if the entry is older than two jiffies.*/age = a - b;if (conn->cpu == cpu || age >= 2) {conn_free(list, conn);return ERR_PTR(-ENOENT);}return ERR_PTR(-EAGAIN);

首先，遍历当前树节点的链表，检查是否有重复表项。

static int __nf_conncount_add(struct net *net,struct nf_conncount_list *list,const struct nf_conntrack_tuple *tuple,const struct nf_conntrack_zone *zone)
{const struct nf_conntrack_tuple_hash *found;struct nf_conncount_tuple *conn, *conn_n;/* check the saved connections */list_for_each_entry_safe(conn, conn_n, &list->head, node) {if (collect > CONNCOUNT_GC_MAX_NODES)break;found = find_or_evict(net, list, conn);if (IS_ERR(found)) {/* Not found, but might be about to be confirmed */if (PTR_ERR(found) == -EAGAIN) {if (nf_ct_tuple_equal(&conn->tuple, tuple) &&nf_ct_zone_id(&conn->zone, conn->zone.dir) ==nf_ct_zone_id(zone, zone->dir))return 0; /* already exists */} else {collect++;}continue;}

如果找到的连接跟踪的tuple与要新增的tuple相等，表明为重复表项，结束处理。否则，如果找到的连接已经关闭，释放相关结构。

        found_ct = nf_ct_tuplehash_to_ctrack(found);if (nf_ct_tuple_equal(&conn->tuple, tuple) &&nf_ct_zone_equal(found_ct, zone, zone->dir)) {/** We should not see tuples twice unless someone hooks* this into a table without "-p tcp --syn".** Attempt to avoid a re-add in this case.*/nf_ct_put(found_ct);return 0;} else if (already_closed(found_ct)) {/** we do not care about connections which are* closed already -> ditch it*/nf_ct_put(found_ct);conn_free(list, conn);collect++;continue;}nf_ct_put(found_ct);}

至此，没有找到重复的表项，需要分配新的链表元素结构体（nf_conncount_tuple），初始化之后，添加到链表末尾，增加链表计数值。

    if (WARN_ON_ONCE(list->count > INT_MAX))return -EOVERFLOW;conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);if (conn == NULL)return -ENOMEM;conn->tuple = *tuple;conn->zone = *zone;conn->cpu = raw_smp_processor_id();conn->jiffies32 = (u32)jiffies;list_add_tail(&conn->node, &list->head);list->count++;return 0;

红黑树表项新增

由树根节点开始遍历，对比键值，找到键值相等的树节点，将对应于连接跟踪的tuple添加到此节点中的链表内。

static unsigned int
insert_tree(struct net *net,struct nf_conncount_data *data,struct rb_root *root,unsigned int hash,const u32 *key,const struct nf_conntrack_tuple *tuple,const struct nf_conntrack_zone *zone)
{struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];struct rb_node **rbnode, *parent;struct nf_conncount_rb *rbconn;struct nf_conncount_tuple *conn;unsigned int count = 0, gc_count = 0;u8 keylen = data->keylen;bool do_gc = true;spin_lock_bh(&nf_conncount_locks[hash]);
restart:parent = NULL;rbnode = &(root->rb_node);while (*rbnode) {int diff;rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);parent = *rbnode;diff = key_diff(key, rbconn->key, keylen);if (diff < 0) {rbnode = &((*rbnode)->rb_left);} else if (diff > 0) {rbnode = &((*rbnode)->rb_right);} else {ret = nf_conncount_add(net, &rbconn->list, tuple, zone);if (ret)count = 0; /* hotdrop */elsecount = rbconn->list.count;tree_nodes_free(root, gc_nodes, gc_count);goto out_unlock;}if (gc_count >= ARRAY_SIZE(gc_nodes))continue;if (do_gc && nf_conncount_gc_list(net, &rbconn->list))gc_nodes[gc_count++] = rbconn;}

如果在以上遍历过程中，发现空的树节点（其链表为空），将其记录到gc_nodes数组中。如果红黑树遍历结束，没有找到键值相等的节点，但存在待释放的空节点，则执行释放操作，调度回收worker，并关闭回收标志（do_gc）后重新遍历（restart）。

    if (gc_count) {tree_nodes_free(root, gc_nodes, gc_count);schedule_gc_worker(data, hash);gc_count = 0;do_gc = false;goto restart;}

如果键值相等的树节点不存在，分配新的树节点结构（rbconn），以及新的连接跟踪计数结构（conn），并进行初始化。将新的树节点添加到红黑树中，并将连接跟踪计数结构链接到新节点的链表中。

    /* expected case: match, insert new node */rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);if (rbconn == NULL)goto out_unlock;conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);if (conn == NULL) {kmem_cache_free(conncount_rb_cachep, rbconn);goto out_unlock;}conn->tuple = *tuple;conn->zone = *zone;memcpy(rbconn->key, key, sizeof(u32) * keylen);nf_conncount_list_init(&rbconn->list);list_add(&conn->node, &rbconn->list.head);count = 1;rbconn->list.count = count;rb_link_node_rcu(&rbconn->node, parent, rbnode);rb_insert_color(&rbconn->node, root);
out_unlock:spin_unlock_bh(&nf_conncount_locks[hash]);return count;

树节点回收

如下链表元素回收函数nf_conncount_gc_list，遍历节点的链表，如果连接跟踪已经关闭，将其对应的计数结构释放。如果遇到一个未关闭的连接跟踪，并且已经回收的计数结构超过8个（CONNCOUNT_GC_MAX_NODES），提前退出遍历。

最后，如果链表中元素为空，返回真。

bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list)
{const struct nf_conntrack_tuple_hash *found;struct nf_conncount_tuple *conn, *conn_n;struct nf_conn *found_ct;bool ret = false;/* don't bother if other cpu is already doing GC */if (!spin_trylock(&list->list_lock)) return false;list_for_each_entry_safe(conn, conn_n, &list->head, node) {found = find_or_evict(net, list, conn);if (IS_ERR(found)) {if (PTR_ERR(found) == -ENOENT)collected++;continue;}found_ct = nf_ct_tuplehash_to_ctrack(found);if (already_closed(found_ct)) {/* we do not care about connections which are closed already -> ditch it*/nf_ct_put(found_ct);conn_free(list, conn);collected++;continue;}nf_ct_put(found_ct);if (collected > CONNCOUNT_GC_MAX_NODES) break;}if (!list->count) ret = true;

遍历指定的红黑树，如果某个树节点中连接跟踪为空，gc_count递增1。如果空节点数量小于8（CONNCOUNT_GC_MAX_NODES），不进行处理。

static void tree_gc_worker(struct work_struct *work)
{struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;struct rb_root *root;struct rb_node *node;unsigned int tree, next_tree, gc_count = 0;tree = data->gc_tree % CONNCOUNT_SLOTS;root = &data->root[tree];local_bh_disable();rcu_read_lock();for (node = rb_first(root); node != NULL; node = rb_next(node)) {rbconn = rb_entry(node, struct nf_conncount_rb, node);if (nf_conncount_gc_list(data->net, &rbconn->list))gc_count++;}rcu_read_unlock();local_bh_enable();cond_resched();spin_lock_bh(&nf_conncount_locks[tree]);if (gc_count < ARRAY_SIZE(gc_nodes))goto next; /* do not bother */

再次遍历此红黑树，当找到8个空节点之后，由函数tree_nodes_free执行释放操作。

    gc_count = 0;node = rb_first(root);while (node != NULL) {rbconn = rb_entry(node, struct nf_conncount_rb, node);node = rb_next(node);if (rbconn->list.count > 0)continue;gc_nodes[gc_count++] = rbconn;if (gc_count >= ARRAY_SIZE(gc_nodes)) {tree_nodes_free(root, gc_nodes, gc_count);gc_count = 0;}}tree_nodes_free(root, gc_nodes, gc_count);

找到下一个等待回收的红黑树，再次调度worker。

next:clear_bit(tree, data->pending_trees);next_tree = (tree + 1) % CONNCOUNT_SLOTS;next_tree = find_next_bit(data->pending_trees, CONNCOUNT_SLOTS, next_tree);if (next_tree < CONNCOUNT_SLOTS) {data->gc_tree = next_tree;schedule_work(work);}

内核版本 5.10

这篇关于netfilter连接跟踪统计的文章就介绍到这儿，希望我们推荐的文章对程序员们有所帮助！