本文主要是介绍netfilter连接跟踪统计,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
连接跟踪统计模块初始化函数nf_conncount_modinit如下,首先初始化256(CONNCOUNT_SLOTS)个自旋锁,保护对应的256个槽位,每个槽位对应一个红黑树,每个树节点为结构(nf_conncount_rb),对应于键值;而树节点结构中的链表成员,包含所有匹配键值的连接跟踪(nf_conncount_tuple),以及数量统计。
之后,函数分配两个内核缓存,用于加速以上两个结构体的分配。
static int __init nf_conncount_modinit(void)
{for (i = 0; i < CONNCOUNT_SLOTS; ++i)spin_lock_init(&nf_conncount_locks[i]);conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple",sizeof(struct nf_conncount_tuple),0, 0, NULL);if (!conncount_conn_cachep)return -ENOMEM;conncount_rb_cachep = kmem_cache_create("nf_conncount_rb",sizeof(struct nf_conncount_rb),0, 0, NULL);
需要使用连接跟踪数量统计模块的话,首先进行如下的初始化(nf_conncount_init),提供数量统计所使用键值的长度(keylen)。键值长度keylen要求是4字节的整数倍,且不能为零,最大不能超过20字节(MAX_KEYLEN=5)。
xt_connlimit需要使用连接跟踪数量统计,其将IP地址作为键值。
struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,unsigned int keylen)
{struct nf_conncount_data *data;if (keylen % sizeof(u32) ||keylen / sizeof(u32) > MAX_KEYLEN ||keylen == 0)return ERR_PTR(-EINVAL);net_get_random_once(&conncount_rnd, sizeof(conncount_rnd));data = kmalloc(sizeof(*data), GFP_KERNEL);if (!data)return ERR_PTR(-ENOMEM);
当前功能需要连接跟踪模块nf_conntrack,和IPv4/IPv6重组模块nf_defrag_ipv4/v6的支持。
ret = nf_ct_netns_get(net, family);if (ret < 0) {kfree(data);return ERR_PTR(ret);}
接下来初始化红黑树根节点,以及初始化回收worker(gc_work,其处理函数为tree_gc_worker)。
for (i = 0; i < ARRAY_SIZE(data->root); ++i)data->root[i] = RB_ROOT;data->keylen = keylen / sizeof(u32);data->net = net;INIT_WORK(&data->gc_work, tree_gc_worker);return data;
增加连接跟踪计数
获取指定命名空间中的键值key对应的连接跟踪数量,如果tuple参数不为空,为其创建统计结构。
/*
 * Report how many conntrack entries in 'net' match the given 'key'.
 * When 'tuple' is non-NULL it is additionally inserted into the
 * accounting data structure.
 * The caller must hold the RCU read lock.
 */
unsigned int nf_conncount_count(struct net *net,
				struct nf_conncount_data *data,
				const u32 *key,
				const struct nf_conntrack_tuple *tuple,
				const struct nf_conntrack_zone *zone)
{
	/* All of the work is delegated to the per-slot tree lookup. */
	unsigned int matched = count_tree(net, data, key, tuple, zone);

	return matched;
}
首先根据键值以及初始化的随机值,确定哈希值,在对应槽位找到红黑树的根。其中,键值较小的项保存在树的左侧;相反,键值较大的项保存在树的右侧。
static unsigned int
count_tree(struct net *net, struct nf_conncount_data *data,const u32 *key, const struct nf_conntrack_tuple *tuple,const struct nf_conntrack_zone *zone)
{struct rb_root *root;struct rb_node *parent;struct nf_conncount_rb *rbconn;u8 keylen = data->keylen;hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;root = &data->root[hash];parent = rcu_dereference_raw(root->rb_node);while (parent) {int diff;rbconn = rb_entry(parent, struct nf_conncount_rb, node);diff = key_diff(key, rbconn->key, keylen);if (diff < 0) {parent = rcu_dereference_raw(parent->rb_left);} else if (diff > 0) {parent = rcu_dereference_raw(parent->rb_right);} else {
如果键值相等,在参数tuple为空的情况下,返回此项对应的统计值。在此之前,针对此项中的链表,执行回收操作(nf_conncount_gc_list)。
if (!tuple) {nf_conncount_gc_list(net, &rbconn->list);return rbconn->list.count;}
反之,如果参数tuple有值,尝试将其添加到红黑树中。如果此项中连接跟踪的数量为零,表明其可能要被删除,使用insert_tree执行插入操作。否则,直接将tuple添加到此节点的链表中。
链表中的元素个数表示的即为键值key(如IP地址)对应的连接跟踪数量。
spin_lock_bh(&rbconn->list.list_lock);/* Node might be about to be free'd.* We need to defer to insert_tree() in this case.*/if (rbconn->list.count == 0) {spin_unlock_bh(&rbconn->list.list_lock);break;}/* same source network -> be counted! */ret = __nf_conncount_add(net, &rbconn->list, tuple, zone);spin_unlock_bh(&rbconn->list.list_lock);if (ret)return 0; /* hotdrop */elsereturn rbconn->list.count;}}if (!tuple) return 0;return insert_tree(net, data, root, hash, key, tuple, zone);
链表元素新增
根据tuple找到对应的连接跟踪元组哈希结构(nf_conntrack_tuple_hash),之后,可据此找到对应的连接跟踪。
static const struct nf_conntrack_tuple_hash *
find_or_evict(struct net *net, struct nf_conncount_list *list,struct nf_conncount_tuple *conn)
{const struct nf_conntrack_tuple_hash *found;unsigned long a, b;int cpu = raw_smp_processor_id();u32 age;found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple);if (found)return found;
反之,如果没有找到,此表项可能刚刚被其它的处理器添加进来,还没有被确认,这种情况下以上函数nf_conntrack_find_get返回空。接下来确定当前的表项(conn)是否需要移除,条件是:1)此表项就是被当前处理器所添加;或者2)此表项的存在时长大于等于2个jiffies(还没有被确认)。符合以上任意条件,则认为当前表项已经无用,将其释放。
否则,返回重试错误(EAGAIN),再次重试时,表项可能已经被确认。
b = conn->jiffies32;a = (u32)jiffies;/* conn might have been added just before by another cpu and* might still be unconfirmed. In this case, nf_conntrack_find()* returns no result. Thus only evict if this cpu added the* stale entry or if the entry is older than two jiffies.*/age = a - b;if (conn->cpu == cpu || age >= 2) {conn_free(list, conn);return ERR_PTR(-ENOENT);}return ERR_PTR(-EAGAIN);
首先,遍历当前树节点的链表,检查是否有重复表项。
static int __nf_conncount_add(struct net *net,struct nf_conncount_list *list,const struct nf_conntrack_tuple *tuple,const struct nf_conntrack_zone *zone)
{const struct nf_conntrack_tuple_hash *found;struct nf_conncount_tuple *conn, *conn_n;/* check the saved connections */list_for_each_entry_safe(conn, conn_n, &list->head, node) {if (collect > CONNCOUNT_GC_MAX_NODES)break;found = find_or_evict(net, list, conn);if (IS_ERR(found)) {/* Not found, but might be about to be confirmed */if (PTR_ERR(found) == -EAGAIN) {if (nf_ct_tuple_equal(&conn->tuple, tuple) &&nf_ct_zone_id(&conn->zone, conn->zone.dir) ==nf_ct_zone_id(zone, zone->dir))return 0; /* already exists */} else {collect++;}continue;}
如果找到的连接跟踪的tuple与要新增的tuple相等,表明为重复表项,结束处理。否则,如果找到的连接已经关闭,释放相关结构。
found_ct = nf_ct_tuplehash_to_ctrack(found);if (nf_ct_tuple_equal(&conn->tuple, tuple) &&nf_ct_zone_equal(found_ct, zone, zone->dir)) {/** We should not see tuples twice unless someone hooks* this into a table without "-p tcp --syn".** Attempt to avoid a re-add in this case.*/nf_ct_put(found_ct);return 0;} else if (already_closed(found_ct)) {/** we do not care about connections which are* closed already -> ditch it*/nf_ct_put(found_ct);conn_free(list, conn);collect++;continue;}nf_ct_put(found_ct);}
至此,没有找到重复的表项,需要分配新的链表元素结构体(nf_conncount_tuple),初始化之后,添加到链表末尾,增加链表计数值。
if (WARN_ON_ONCE(list->count > INT_MAX))return -EOVERFLOW;conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);if (conn == NULL)return -ENOMEM;conn->tuple = *tuple;conn->zone = *zone;conn->cpu = raw_smp_processor_id();conn->jiffies32 = (u32)jiffies;list_add_tail(&conn->node, &list->head);list->count++;return 0;
红黑树表项新增
由树根节点开始遍历,对比键值,找到键值相等的树节点,将对应于连接跟踪的tuple添加到此节点中的链表内。
static unsigned int
insert_tree(struct net *net,struct nf_conncount_data *data,struct rb_root *root,unsigned int hash,const u32 *key,const struct nf_conntrack_tuple *tuple,const struct nf_conntrack_zone *zone)
{struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];struct rb_node **rbnode, *parent;struct nf_conncount_rb *rbconn;struct nf_conncount_tuple *conn;unsigned int count = 0, gc_count = 0;u8 keylen = data->keylen;bool do_gc = true;spin_lock_bh(&nf_conncount_locks[hash]);
restart:parent = NULL;rbnode = &(root->rb_node);while (*rbnode) {int diff;rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);parent = *rbnode;diff = key_diff(key, rbconn->key, keylen);if (diff < 0) {rbnode = &((*rbnode)->rb_left);} else if (diff > 0) {rbnode = &((*rbnode)->rb_right);} else {ret = nf_conncount_add(net, &rbconn->list, tuple, zone);if (ret)count = 0; /* hotdrop */elsecount = rbconn->list.count;tree_nodes_free(root, gc_nodes, gc_count);goto out_unlock;}if (gc_count >= ARRAY_SIZE(gc_nodes))continue;if (do_gc && nf_conncount_gc_list(net, &rbconn->list))gc_nodes[gc_count++] = rbconn;}
如果在以上遍历过程中,发现空的树节点(其链表为空),将其记录到gc_nodes数组中。如果红黑树遍历结束,没有找到键值相等的节点,并且存在待释放的空节点,则执行释放操作,调度回收worker,之后重新开始遍历(此次遍历不再执行回收)。
if (gc_count) {tree_nodes_free(root, gc_nodes, gc_count);schedule_gc_worker(data, hash);gc_count = 0;do_gc = false;goto restart;}
如果键值相等的树节点不存在,分配新的树节点结构(rbconn),以及新的连接跟踪计数结构(conn),并进行初始化。将新的树节点添加到红黑树中,并将连接跟踪计数结构链接到新节点的链表中。
/* expected case: match, insert new node */rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);if (rbconn == NULL)goto out_unlock;conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);if (conn == NULL) {kmem_cache_free(conncount_rb_cachep, rbconn);goto out_unlock;}conn->tuple = *tuple;conn->zone = *zone;memcpy(rbconn->key, key, sizeof(u32) * keylen);nf_conncount_list_init(&rbconn->list);list_add(&conn->node, &rbconn->list.head);count = 1;rbconn->list.count = count;rb_link_node_rcu(&rbconn->node, parent, rbnode);rb_insert_color(&rbconn->node, root);
out_unlock:spin_unlock_bh(&nf_conncount_locks[hash]);return count;
树节点回收
如下链表元素回收函数nf_conncount_gc_list,遍历节点的链表,如果连接跟踪已经关闭,将其对应的计数结构释放。如果遇到一个未关闭的连接跟踪,并且已经释放的计数结构超过8个(CONNCOUNT_GC_MAX_NODES),提前退出遍历。
最后,如果链表中元素为空,返回真。
bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list)
{const struct nf_conntrack_tuple_hash *found;struct nf_conncount_tuple *conn, *conn_n;struct nf_conn *found_ct;bool ret = false;/* don't bother if other cpu is already doing GC */if (!spin_trylock(&list->list_lock)) return false;list_for_each_entry_safe(conn, conn_n, &list->head, node) {found = find_or_evict(net, list, conn);if (IS_ERR(found)) {if (PTR_ERR(found) == -ENOENT)collected++;continue;}found_ct = nf_ct_tuplehash_to_ctrack(found);if (already_closed(found_ct)) {/* we do not care about connections which are closed already -> ditch it*/nf_ct_put(found_ct);conn_free(list, conn);collected++;continue;}nf_ct_put(found_ct);if (collected > CONNCOUNT_GC_MAX_NODES) break;}if (!list->count) ret = true;
遍历指定的红黑树,如果某个树节点中连接跟踪为空,gc_count递增1。如果空节点数量小于8(CONNCOUNT_GC_MAX_NODES),不进行处理。
static void tree_gc_worker(struct work_struct *work)
{struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;struct rb_root *root;struct rb_node *node;unsigned int tree, next_tree, gc_count = 0;tree = data->gc_tree % CONNCOUNT_SLOTS;root = &data->root[tree];local_bh_disable();rcu_read_lock();for (node = rb_first(root); node != NULL; node = rb_next(node)) {rbconn = rb_entry(node, struct nf_conncount_rb, node);if (nf_conncount_gc_list(data->net, &rbconn->list))gc_count++;}rcu_read_unlock();local_bh_enable();cond_resched();spin_lock_bh(&nf_conncount_locks[tree]);if (gc_count < ARRAY_SIZE(gc_nodes))goto next; /* do not bother */
再次遍历此红黑树,每当找到8个空节点之后,由函数tree_nodes_free执行释放操作;遍历结束后,剩余不足8个的空节点同样由tree_nodes_free释放。
gc_count = 0;node = rb_first(root);while (node != NULL) {rbconn = rb_entry(node, struct nf_conncount_rb, node);node = rb_next(node);if (rbconn->list.count > 0)continue;gc_nodes[gc_count++] = rbconn;if (gc_count >= ARRAY_SIZE(gc_nodes)) {tree_nodes_free(root, gc_nodes, gc_count);gc_count = 0;}}tree_nodes_free(root, gc_nodes, gc_count);
找到下一个等待回收的红黑树,再次调度worker。
next:clear_bit(tree, data->pending_trees);next_tree = (tree + 1) % CONNCOUNT_SLOTS;next_tree = find_next_bit(data->pending_trees, CONNCOUNT_SLOTS, next_tree);if (next_tree < CONNCOUNT_SLOTS) {data->gc_tree = next_tree;schedule_work(work);}
内核版本 5.10
这篇关于netfilter连接跟踪统计的文章就介绍到这儿,希望我们推荐的文章对程序员们有所帮助!