在向外發送數據包的時候,首先需要查詢路由表來確定路由包的路由,主要由ip_route_output_key()函數來完成,該函數又調用了ip_route_output_flow(),而這個函數最終又調用了__ip_route_output_key()這個函數來進行路由的查詢,下面主要來看一下這個函數: int __ip_route_output_key(struct net *net, struct rtable **rp, const struct flowi *flp){ unsigned int hash; int res; struct rtable *rth; if (!rt_caching(net)) goto slow_output; hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); rcu_read_lock_bh(); for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; rth = rcu_dereference_bh(rth->dst.rt_next)) { if (rth->fl.fl4_dst == flp->fl4_dst && rth->fl.fl4_src == flp->fl4_src && rth->fl.iif == 0 && rth->fl.oif == flp->oif && rth->fl.mark == flp->mark && !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK)) && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { dst_use(&rth->dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); *rp = rth; return 0; } RT_CACHE_STAT_INC(out_hlist_search); } rcu_read_unlock_bh(); slow_output: rcu_read_lock(); res = ip_route_output_slow(net, rp, flp); rcu_read_unlock(); return res;} Linux的路由表中的常用路由是存儲在路由緩存中的,該路由緩存即是類型為struct rt_hash_bucket的全局列表rt_hash_table,該緩存列表在ip_rt_init()中初始化。 struct flowi結構中包含了查詢路由表所需要的請求信息,是一個搜索鍵值。由代碼可看出,首先在路由緩存列表rt_hash_table中查詢精確匹配的未過期的路由表項struct rtable,(注,因為是出口路由,所以入口接口號是0),若找到,則增加路由表項的引用計數后即刻返回。若未找到匹配的路由表項,則繼續在路由表中查找匹配的路由表項,路由表中的查詢速度會比路由緩存中慢,所以ip_route_output_slow()函數的命名就不難理解了,主要的路由解析工作都是在這個函數里面進行的,在看它的定義之前先看下服務類型和路由范圍的相關定義: #define IPTOS_TOS_MASK 0x1E#define IPTOS_TOS(tos) ((tos)&IPTOS_TOS_MASK)#define IPTOS_LOWDELAY 0x10 /* 最小延時 */#define IPTOS_THROUGHPUT 0x08 /* 最大吞吐量 */#define IPTOS_RELIABILITY 0x04 /* 最高可靠性 */#define IPTOS_MINCOST 0x02 /* 最小消費 */#define RTO_ONLINK 0x01 由掩碼可知,服務類型實際上用了從第2位到第5位共四位的數據,表示四種服務類型,而最低位的RTO_ONLINK如果置位,則scope為RT_SCOPE_LINK,若沒有置位,則scope為RT_SCOPE_UNIVERSE,接下來看看scope的相關定義: enum rt_scope_t { RT_SCOPE_UNIVERSE=0, /* 表示在空間中的任何位置 *//* User 
defined values */ RT_SCOPE_SITE=200, RT_SCOPE_LINK=253, /* 與本地直接相連的地址 */ RT_SCOPE_HOST=254, /* 本地地址 */ RT_SCOPE_NOWHERE=255 /* 不可達的地址 */}; 其中值越大所表示的范圍便越精確,實際上這也不是什么范圍的意思,只不過是到目的地址的某種距離的表示。OK,接下來看ip_route_output_slow()函數的定義: static int ip_route_output_slow(struct net *net, struct rtable **rp, const struct flowi *oldflp){ u32 tos = RT_FL_TOS(oldflp); struct flowi fl = { .nl_u = { .ip4_u = { .daddr = oldflp->fl4_dst, .saddr = oldflp->fl4_src, .tos = tos & IPTOS_RT_MASK, .scope = ((tos & RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE), } }, .mark = oldflp->mark, .iif = net->loopback_dev->ifindex, .oif = oldflp->oif }; struct fib_result res; unsigned int flags = 0; struct net_device *dev_out = NULL; int err; res.fi = NULL;#ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL;#endif if (oldflp->fl4_src) { /* 若源地址為組播地址,受限廣播地址(255.255.255.255)或0地址, 均不合法,即刻返回 */ err = -EINVAL; if (ipv4_is_multicast(oldflp->fl4_src) || ipv4_is_lbcast(oldflp->fl4_src) || ipv4_is_zeronet(oldflp->fl4_src)) goto out; if (oldflp->oif == 0 && (ipv4_is_multicast(oldflp->fl4_dst) || ipv4_is_lbcast(oldflp->fl4_dst))) { /* 等價于inet_addr_type(saddr) == RTN_LOCAL, __ip_dev_find()函數實際是搜索RT_TABLE_LOCAL 路由表中的路由表項,如果未找到對應設備則返回,因為 Linux不允許環回接口發組播或受限廣播 */ dev_out = __ip_dev_find(net, oldflp->fl4_src, false); if (dev_out == NULL) goto out; /* 給外出接口賦值后轉去創建路由緩存 */ fl.oif = dev_out->ifindex; goto make_route; } if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ if (!__ip_dev_find(net, oldflp->fl4_src, false)) goto out; } } if (oldflp->oif) { dev_out = dev_get_by_index_rcu(net, oldflp->oif); err = -ENODEV; if (dev_out == NULL) goto out; /* 如果外出接口未啟用或外出接口對應的IPv4數據不存在,則返回網絡不可達 */ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { err = -ENETUNREACH; goto out; } /* 若是本地組播地址或受限廣播地址則直接轉去創建路由緩存 */ if (ipv4_is_local_multicast(oldflp->fl4_dst) || ipv4_is_lbcast(oldflp->fl4_dst)) { if 
(!fl.fl4_src) fl.fl4_src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } /* 若未指定源地址,則根據目的地址類型選擇一個源地址 */ if (!fl.fl4_src) { if (ipv4_is_multicast(oldflp->fl4_dst)) fl.fl4_src = inet_select_addr(dev_out, 0, fl.fl4_scope); else if (!oldflp->fl4_dst) fl.fl4_src = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } } /* 如果目的地址不存在,則令目的地址等于源地址,若都不存在,則使用環回接口, 路由類型為本地路由,轉而創建路由緩存 */ if (!fl.fl4_dst) { fl.fl4_dst = fl.fl4_src; if (!fl.fl4_dst) fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; fl.oif = net->loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; }/*OK, 走到這里先總結一下不需要查詢路由表即可直接創建路由緩存的情況:1. 指定了源地址,未指定外出接口,目的地址為組播地址或受限廣播地址2. 指定了外出接口,并且目的地址為本地組播地址或受限廣播地址3. 未指定目的地址。 若以上三種情況均未滿足,則需要進行路由表查詢。*/ if (fib_lookup(net, &fl, &res)) { res.fi = NULL; if (oldflp->oif) { /* 程序走到這里說明查詢路由表失敗,未找到對應的路由表項, 但卻指定了外出接口,這時候即便沒有路由也是可以發送數據包的。 當然,如果未指定外出接口,則只能返回網絡不可達了。 */ if (fl.fl4_src == 0) fl.fl4_src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); res.type = RTN_UNICAST; goto make_route; } err = -ENETUNREACH; goto out; } /* 若為本地路由,則使用環回接口 */ if (res.type == RTN_LOCAL) { if (!fl.fl4_src) { if (res.fi->fib_prefsrc) fl.fl4_src = res.fi->fib_prefsrc; else fl.fl4_src = fl.fl4_dst; } dev_out = net->loopback_dev; fl.oif = dev_out->ifindex; res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; } /* 使用默認路由需要三個條件:1. 若前綴為0,也即掩碼長度為0,默認路由匹配所有的目的地址。2. 路由類型為RTN_UNICAST,我們知道本地地址,組播地址和廣播地址都不需要選擇默認網關。3. 
未指定出口設備,上面我們提到即便是沒有路由的情況下提供了出口設備,數據包也是可以發送的。 這時候路由是默認路由,因此我們需要選擇默認網關 */ if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) fib_select_default(net, &fl, &res); if (!fl.fl4_src) fl.fl4_src = FIB_RES_PREFSRC(res); dev_out = FIB_RES_DEV(res); fl.oif = dev_out->ifindex; make_route: /* 創建一條路由緩存 */ err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); out: return err;} 接下來看下創建路由緩存的函數: static int ip_mkroute_output(struct rtable **rp, struct fib_result *res, const struct flowi *fl, const struct flowi *oldflp, struct net_device *dev_out, unsigned flags){ struct rtable *rth = NULL; int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); unsigned hash; if (err == 0) { hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, rt_genid(dev_net(dev_out))); err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif); } return err;} 該函數首先調用__mkroute_output()函數生成一條路由緩存,然后再調用rt_intern_hash()函數寫入到緩存列表中去。 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp, struct sk_buff *skb, int ifindex){ struct rtable *rth, *cand; struct rtable __rcu **rthp, **candp; unsigned long now; u32 min_score; int chain_length; int attempts = !in_softirq(); restart: chain_length = 0; min_score = ~(u32)0; cand = NULL; candp = NULL; now = jiffies; if (!rt_caching(dev_net(rt->dst.dev))) {/* 如果路由未進行緩存,那么把路由的DST標志設為DST_NOCACHE,調用者便會知道這條路由未進行緩存,使用完成之后可以根據該標志對路由進行釋放。如果在這里把路由給丟掉的話,那么當沒有進行路由緩存的情況下調用者就沒辦法解析路由信息了。 */ rt->dst.flags |= DST_NOCACHE; if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { int err = arp_bind_neighbour(&rt->dst); if (err) { if (net_ratelimit()) printk(KERN_WARNING \'Neighbour table failure & not caching routes.\\n\'); ip_rt_put(rt); return err; } } goto skip_hashing; } rthp = &rt_hash_table[hash].chain; spin_lock_bh(rt_hash_lock_addr(hash)); /* 開始遍歷哈希鏈表 */ while ((rth = rcu_dereference_protected(*rthp, 
lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { /* 如果路由已過期,則直接從列表中刪除并釋放內存空間 */ if (rt_is_expired(rth)) { *rthp = rth->dst.rt_next; rt_free(rth); continue; } /* 如果未過期,并在列表中找到了匹配的路由,則將該路由緩存項拿到鏈表的最前端,并增加引用計數,釋放新建待插入的緩存項內存。 */ if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { *rthp = rth->dst.rt_next; rcu_assign_pointer(rth->dst.rt_next, rt_hash_table[hash].chain); rcu_assign_pointer(rt_hash_table[hash].chain, rth); dst_use(&rth->dst, now); spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); if (rp) *rp = rth; else skb_dst_set(skb, &rth->dst); return 0; } if (!atomic_read(&rth->dst.__refcnt)) { u32 score = rt_score(rth); if (score <= min_score) { cand = rth; candp = rthp; min_score = score; } } chain_length++; rthp = &rth->dst.rt_next; } if (cand) { /* ip_rt_gc_elasticity used to be average length of chain * length, when exceeded gc becomes really aggressive. * * The second limit is less certain. At the moment it allows * only 2 entries per bucket. We will see. 
*/ if (chain_length > ip_rt_gc_elasticity) { *candp = cand->dst.rt_next; rt_free(cand); } } else {/* 如果某個哈希槽上的鏈表長度大于所能接受的鏈表的最大長度, 則說明哈希碰撞太嚴重,需要重構哈希表,這個長度目前定義為20。 如果需要重構則增加重構計數current_rt_cache_rebuild_count的值, rt_caching()函數就是簡單地判斷該值是否超過最大值來斷定緩存是否正在進行的,最大值為4。 */ if (chain_length > rt_chain_length_max && slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { struct net *net = dev_net(rt->dst.dev); int num = ++net->ipv4.current_rt_cache_rebuild_count; if (!rt_caching(net)) { printk(KERN_WARNING \'%s: %d rebuilds is over limit, route caching disabled\\n\', rt->dst.dev->name, num); } /* 重建哈希列表,然后重新開始此函數 */ rt_emergency_hash_rebuild(net); spin_unlock_bh(rt_hash_lock_addr(hash)); hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, ifindex, rt_genid(net)); goto restart; } } /* 當路由為單播路由或者為外出路由(iif為0的情況即為外出路由) 則需要把路由綁定到arp */ if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { int err = arp_bind_neighbour(&rt->dst); if (err) { spin_unlock_bh(rt_hash_lock_addr(hash)); if (err != -ENOBUFS) { rt_drop(rt); return err; } /* Neighbour tables are full and nothing can be released. Try to shrink route cache, it is most likely it holds some neighbour records. */ if (attempts-- > 0) { int saved_elasticity = ip_rt_gc_elasticity; int saved_int = ip_rt_gc_min_interval; ip_rt_gc_elasticity = 1; ip_rt_gc_min_interval = 0; /* 路由表進行垃圾回收,這個以后再寫 */ rt_garbage_collect(&ipv4_dst_ops); ip_rt_gc_min_interval = saved_int; ip_rt_gc_elasticity = saved_elasticity; goto restart; } if (net_ratelimit()) printk(KERN_WARNING \'ipv4: Neighbour table overflow.\\n\'); rt_drop(rt); return -ENOBUFS; } } /* 將該表項放至哈希鏈表的頭部 */ rt->dst.rt_next = rt_hash_table[hash].chain; /* * Since lookup is lockfree, we must make sure * previous writes to rt are comitted to memory * before making rt visible to other CPUS. 
*/ rcu_assign_pointer(rt_hash_table[hash].chain, rt); spin_unlock_bh(rt_hash_lock_addr(hash)); skip_hashing: if (rp) *rp = rt; else skb_dst_set(skb, &rt->dst); return 0;} 簡單注釋了一下幾個比較重要的函數,求大牛批評指正。 |
|