3、緩存的查找
當數據包進入網絡層后,第一個被調用的函數是ip_rcv函數:
/*
 * Main IP Receive routine.
 *
 * Validates the IPv4 header of an incoming skb (minimum length, version,
 * header checksum, total length), trims the buffer to the datagram's
 * tot_len, and hands it to the NF_IP_PRE_ROUTING Netfilter hook with
 * ip_rcv_finish as the continuation.
 *
 * Returns whatever NF_HOOK() returns when the packet passes all checks,
 * or NET_RX_DROP on every rejection path. The pt argument is not used
 * inside this function.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
{
struct iphdr *iph;
/* Frames addressed to another host (e.g. seen in promiscuous mode) are dropped */
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
/* Update the SNMP statistics (received-packets counter) */
IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);
/*
 * skb_share_check checks whether the skb is shared; if someone else is
 * already using it, a clone is made for our own use.
 * On failure skb is NULL here, so we jump to "out" and skip kfree_skb.
 */
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
goto out;
}
/* A valid IP packet must be at least as long as the IP header */
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
/* Fetch the IP header */
iph = skb->nh.iph;
/*
* RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
*
* Is the datagram acceptable?
*
* 1. Length at least the size of an ip header
* 2. Version of 4
* 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
* 4. Doesn't have a bogus length
*/
/* Header length and version checks */
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
/* Make sure the full header, including options (ihl is in 32-bit words), can be pulled */
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
/*
 * In the unlucky case, the pskb_may_pull call above goes on to call
 * __pskb_pull_tail to bring in the packet's out-of-line (paged) data and
 * linearize the fragmented part, so iph must be reloaded here to point
 * at the correct IP header.
 */
iph = skb->nh.iph;
/* Header checksum check */
if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
goto inhdr_error;
{
__u32 len = ntohs(iph->tot_len);
/* Reject if the buffer is shorter than tot_len, or tot_len is shorter than the header */
if (skb->len < len || len < (iph->ihl<<2))
goto inhdr_error;
/* Our transport medium may have padded the buffer out. Now we know it
* is IP we can trim to the true length of the frame.
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
if (pskb_trim_rcsum(skb, len)) {
IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
goto drop;
}
}
/* Enter the Netfilter hook; when it is done, processing continues in ip_rcv_finish */
return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
/* Header error: count it, then fall through to drop */
inhdr_error:
IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
/* Free the skb, then fall through to out */
drop:
kfree_skb(skb);
out:
return NET_RX_DROP;
}
這一部份代碼,簡而言之,就是取得IP首部,進行合法性檢查,然后調用ip_rcv_finish函數。關于Netfilter的更多內容,請參考九賤的《Linux防火墻設計與Netfilter源碼分析》。
ip_rcv_finish 要做的第一件事情,就是調用ip_route_input函數進行緩存查找:
static inline int ip_rcv_finish(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct iphdr *iph = skb->nh.iph;
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
if (skb->dst == NULL) {
if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
goto drop;
}
……
這就進入我們本章的主題了,接下來看看ip_route_input是如何進行緩存查找的。
/*
 * Look up the input route for a received packet in the route cache.
 *
 * On a cache hit the matching entry is referenced (dst_hold), stored in
 * skb->dst, and 0 is returned. On a miss, multicast destinations are
 * handed to ip_route_input_mc() (or rejected with -EINVAL) and all other
 * destinations go to ip_route_input_slow(); the return value of the
 * chosen function is passed back to the caller.
 */
int ip_route_input(struct sk_buff *skb, //the packet
u32 daddr, u32 saddr, //destination and source addresses
u8 tos, //TOS
struct net_device *dev) //input device
{
struct rtable * rth;
unsigned hash;
int iif = dev->ifindex;
/* Only the bits selected by IPTOS_RT_MASK take part in the lookup */
tos &= IPTOS_RT_MASK;
/* The bucket index is derived from daddr, saddr mixed with the input ifindex, and tos */
hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
/* Walk the bucket's chain inside an RCU read-side critical section */
rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->u.rt_next)) {
/*
 * An entry matches when destination, source, input interface and TOS
 * are equal and oif is 0; with CONFIG_IP_ROUTE_FWMARK the entry's
 * fwmark must also equal skb->nfmark.
 */
if (rth->fl.fl4_dst == daddr &&
rth->fl.fl4_src == saddr &&
rth->fl.iif == iif &&
rth->fl.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
rth->fl.fl4_fwmark == skb->nfmark &&
#endif
rth->fl.fl4_tos == tos) {
/* Cache hit: refresh the last-use time, take a reference, bump the use counter */
rth->u.dst.lastuse = jiffies;
dst_hold(&rth->u.dst);
rth->u.dst.__use++;
RT_CACHE_STAT_INC(in_hit);
rcu_read_unlock();
/* Attach the cache entry to the packet */
skb->dst = (struct dst_entry*)rth;
return 0;
}
/* Counted once for every chain entry that did not match */
RT_CACHE_STAT_INC(in_hlist_search);
}
rcu_read_unlock();
/* Cache miss. Multicast destinations are handled separately. */
if (MULTICAST(daddr)) {
struct in_device *in_dev;
rcu_read_lock();
if ((in_dev = __in_dev_get(dev)) != NULL) {
int our = ip_check_mc(in_dev, daddr, saddr,
skb->nh.iph->protocol);
/*
 * Proceed when ip_check_mc() returned non-zero, or, with
 * CONFIG_IP_MROUTE, when daddr is not a LOCAL_MCAST address and
 * IN_DEV_MFORWARD(in_dev) is set.
 */
if (our
#ifdef CONFIG_IP_MROUTE
|| (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
) {
rcu_read_unlock();
return ip_route_input_mc(skb, daddr, saddr,
tos, dev, our);
}
}
rcu_read_unlock();
/* Multicast packet with no in_device, or one that neither condition accepted */
return -EINVAL;
}
/* Everything else takes the slow path */
return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
函數的第一個工作,就是根據目的地址、源地址、接口索引和TOS值計算hash值:
hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
這里用到了rcu鎖,關于這個鎖的更多內容,可以參考其它相關資料。宏rcu_dereference在RCU讀臨界部份中取出一個RCU保護的指針,并在需要內存屏障的體系結構中插入內存屏障:
/*
 * rcu_dereference(p) - fetch a pointer for use inside an RCU read-side section.
 *
 * Copies p into a local temporary of the same type, issues
 * smp_read_barrier_depends(), and evaluates to the temporary. Uses a GNU
 * statement expression, so the whole macro can be used as a value.
 */
#define rcu_dereference(p) ({ \
typeof(p) _________p1 = p; \
smp_read_barrier_depends(); \
(_________p1); \
})
于是,我們有了hash值后,就可以在hash桶中直接找到鏈表入口:
struct rtable * rth;
rth = rcu_dereference(rt_hash_table[hash].chain);
如果要遍歷該鏈表中的所有路由緩存項,就可以使用如下循環:
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->u.rt_next)) {
……
}
遍歷每一個緩存項很簡單,重要的是如何將緩存中的路由特征同數據包的特征值進行匹配。
struct rtable中的fl成員,用于存儲相關的路由特征值,也就是路由緩存查找匹配的關鍵字,它是一個struct flowi結構類型:
/*
 * struct flowi - description of a flow.
 *
 * Holds the device indices, the L3 addresses/marks and the L4 selectors
 * of a flow. The fl member of a route cache entry is of this type, and
 * ip_route_input() compares its iif, oif, fl4_dst, fl4_src, fl4_tos
 * (and optionally fl4_fwmark) fields against the incoming packet.
 */
struct flowi {
/* Egress device ID and ingress device ID */
int oif;
int iif;
/*
 * Each member of this union is a structure used to specify the L3
 * parameter values. The currently supported protocols are IPv4, IPv6
 * and DECnet.
 */
union {
struct {
__u32 daddr;
__u32 saddr;
__u32 fwmark;
__u8 tos;
__u8 scope;
} ip4_u;
struct {
struct in6_addr daddr;
struct in6_addr saddr;
__u32 flowlabel;
} ip6_u;
struct {
__u16 daddr;
__u16 saddr;
__u32 fwmark;
__u8 scope;
} dn_u;
} nl_u;
/* Shorthand accessors for the DECnet, IPv6 and IPv4 members of nl_u */
#define fld_dst nl_u.dn_u.daddr
#define fld_src nl_u.dn_u.saddr
#define fld_fwmark nl_u.dn_u.fwmark
#define fld_scope nl_u.dn_u.scope
#define fl6_dst nl_u.ip6_u.daddr
#define fl6_src nl_u.ip6_u.saddr
#define fl6_flowlabel nl_u.ip6_u.flowlabel
#define fl4_dst nl_u.ip4_u.daddr
#define fl4_src nl_u.ip4_u.saddr
#define fl4_fwmark nl_u.ip4_u.fwmark
#define fl4_tos nl_u.ip4_u.tos
#define fl4_scope nl_u.ip4_u.scope
/* L4 protocol */
__u8 proto;
/*
 * Only one flag is defined for this field, FLOWI_FLAG_MULTIPATHOLDROUTE.
 * It was originally used by the multipath code but is no longer used.
 */
__u8 flags;
#define FLOWI_FLAG_MULTIPATHOLDROUTE 0x01
/*
 * Each member of this union is a structure used to specify the L4
 * parameter values. The currently supported protocols are TCP, UDP,
 * ICMP, DECnet and the IPsec protocol suite.
 */
union {
struct {
__u16 sport;
__u16 dport;
} ports;
struct {
__u8 type;
__u8 code;
} icmpt;
struct {
__u16 sport;
__u16 dport;
__u8 objnum;
__u8 objnamel; /* Not 16 bits since max val is 16 */
__u8 objname[16]; /* Not zero terminated */
} dnports;
__u32 spi;
} uli_u;
/* Shorthand accessors for the members of uli_u */
#define fl_ip_sport uli_u.ports.sport
#define fl_ip_dport uli_u.ports.dport
#define fl_icmp_type uli_u.icmpt.type
#define fl_icmp_code uli_u.icmpt.code
#define fl_ipsec_spi uli_u.spi
} __attribute__((__aligned__(BITS_PER_LONG/8)));
拋開其它協議和成員,聯合體成員ip4_u就是IPv4協議關心的東東了:
struct {
__u32 daddr;
__u32 saddr;
__u32 fwmark;
__u8 tos;
__u8 scope;
} ip4_u;
于是,在遍歷路由緩存項時,就可以使用如下語句來匹配路由緩存:
if (rth->fl.fl4_dst == daddr &&
rth->fl.fl4_src == saddr &&
rth->fl.iif == iif &&
rth->fl.oif == 0 &&
#ifdef CONFIG_IP_ROUTE_FWMARK
rth->fl.fl4_fwmark == skb->nfmark &&
#endif
rth->fl.fl4_tos == tos)
分別對來源/目的地址,輸入/輸出設備,Netfilter防火墻的標記值和TOS值進行匹配。如果手氣好,查找命中了:
rth->u.dst.lastuse = jiffies; //更新最近使用時間標記
dst_hold(&rth->u.dst);
rth->u.dst.__use++; //更新緩存使用計數器
RT_CACHE_STAT_INC(in_hit);
rcu_read_unlock();
skb->dst = (struct dst_entry*)rth; //設置skb的dst指針指向路由緩存項
return 0;
如果沒有查到,怎么辦?
當然,不是次次都能糊清一色的,如果沒有命中的話,就要去查找路由表了。可以推想,網絡棧在緩存查找沒有命中后,會去搜索路由表,如果路由表匹配,會將由此產生的路由緩存項插入緩存表,以待下一次使用。關于路由表查找的相關內容,我會在下一章中分析。