Event-based signaling mechanisms: they allow the kernel to deliver events, so that user-space processes do not have to poll for data to make sure that they have up-to-date information on some kernel-space aspect.
link: Communicating between the kernel and user-space in Linux using Netlink sockets
之前写的《Netlink 监听 XFRM 状态消息 》 就用到了事件信号通知机制。
用户态进程阻塞在recvmsgs()上,等待内核消息;
当NEWSA
事件发生时会通知所有监听该事件的用户态进程;
用户态进程的recvmsgs() 收到数据返回;
下面以NEWSA
事件为例分析下内核中Netlink处理的流程。以消息类型XFRM_MSG_NEWSA
为线索在内核源码中穿梭。
xfrm_add_sa
通过查找XFRM_MSG_NEWSA
,快速发现了xfrm_add_sa()函数和km_event.
// linux/net/xfrm/xfrm_user.c
/*
 * Netlink handler for SA-add requests (abridged excerpt: the declaration of
 * err, the error checks and the return statement are not shown).
 * Runs verify_newsa_info() on the request payload, builds an xfrm_state with
 * xfrm_state_construct(), adds it via xfrm_state_add() when the message type
 * is XFRM_MSG_NEWSA, and finally reports the event through km_state_notify().
 */
static int xfrm_add_sa ( struct sk_buff * skb , struct nlmsghdr * nlh ,
struct nlattr ** attrs )
{
struct net * net = sock_net ( skb -> sk );
/* p is the payload of the netlink message, as returned by nlmsg_data(). */
struct xfrm_usersa_info * p = nlmsg_data ( nlh );
struct xfrm_state * x ;
struct km_event c ;
err = verify_newsa_info ( p , attrs );
x = xfrm_state_construct ( net , p , attrs , & err );
xfrm_state_hold ( x );
if ( nlh -> nlmsg_type == XFRM_MSG_NEWSA )
err = xfrm_state_add ( x );
/*
 * Fill the event from the request header: sequence number, sender port id
 * and message type (the message type is reused as the event code).
 */
c . seq = nlh -> nlmsg_seq ;
c . portid = nlh -> nlmsg_pid ;
c . event = nlh -> nlmsg_type ;
km_state_notify ( x , & c );
}
从函数名来看,xfrm_add_sa 是将XFRM_MSG_NEWSA类型的netlink message解析并添加到内核的SAD中,最后将该事件广播出去。
先看是谁调用的xfrm_add_sa()
再分析km_state_notify()
从netlink_socket创建到调用xfrm_add_sa
我是逆向查找的xfrm_add_sa()调用过程,为了便于查看,我把结果给“正”了过来。
/*
 * Per-network-namespace init: creates the kernel-side NETLINK_XFRM socket and
 * stores it in net->xfrm.
 * Returns 0 on success, or -ENOMEM when netlink_kernel_create() returns NULL.
 */
static int __net_init xfrm_user_net_init ( struct net * net )
{
struct sock * nlsk ;
/* Socket configuration: groups value and the function set as input handler. */
struct netlink_kernel_cfg cfg = {
. groups = XFRMNLGRP_MAX ,
. input = xfrm_netlink_rcv ,
};
nlsk = netlink_kernel_create ( net , NETLINK_XFRM , & cfg );
if ( nlsk == NULL )
return - ENOMEM ;
net -> xfrm . nlsk_stash = nlsk ; /* Don't set to NULL */
/* The same socket is also stored in net->xfrm.nlsk via rcu_assign_pointer(). */
rcu_assign_pointer ( net -> xfrm . nlsk , nlsk );
return 0 ;
}
注意 socket注册的Input处理函数,忽略groups
消息类型与处理函数通过 xfrm_link 数组 xfrm_dispatch
建立了映射
/*
 * Input handler of the kernel XFRM netlink socket (it is the .input member of
 * the netlink_kernel_cfg used in xfrm_user_net_init()).
 * Passes the skb to netlink_rcv_skb() together with xfrm_user_rcv_msg as the
 * message callback.
 */
static void xfrm_netlink_rcv ( struct sk_buff * skb )
{
netlink_rcv_skb ( skb , & xfrm_user_rcv_msg );
}
/*
 * Dispatch table indexed by (message type - XFRM_MSG_BASE); the doit member
 * of an entry is the handler for that message type.
 * Only the XFRM_MSG_NEWSA entry is shown; the remaining entries are elided.
 */
struct xfrm_link xfrm_dispatch [] = {
[ XFRM_MSG_NEWSA - XFRM_MSG_BASE ] = { . doit = xfrm_add_sa },
...
};
/*
 * Message callback: looks up the handler for the message type in
 * xfrm_dispatch, parses the attributes and calls the handler.
 * Abridged excerpt: the declarations of type and err and any validation of
 * type or err are not shown here.
 * Returns whatever the selected doit handler returns.
 */
static int xfrm_user_rcv_msg ( struct sk_buff * skb , struct nlmsghdr * nlh )
{
struct net * net = sock_net ( skb -> sk );
struct nlattr * attrs [ XFRMA_MAX + 1 ];
const struct xfrm_link * link ;
/* Turn the message type into a zero-based index into xfrm_dispatch. */
type = nlh -> nlmsg_type ;
type -= XFRM_MSG_BASE ;
link = & xfrm_dispatch [ type ];
/* Parse the message attributes into attrs[], using xfrma_policy. */
err = nlmsg_parse ( nlh , xfrm_msg_min [ type ], attrs , XFRMA_MAX ,
xfrma_policy );
return link -> doit ( skb , nlh , attrs );
}
km_state_notify()分析
参数包含了xfrm_state 的所有信息,保证了监听程序能得到所有的SA信息
/*
 * Delivers a state event to every registered key manager.
 * Walks xfrm_km_list inside an rcu_read_lock()/rcu_read_unlock() section and
 * calls the notify callback of each entry that has one, passing x and c
 * through unchanged.
 */
void km_state_notify ( struct xfrm_state * x , const struct km_event * c )
{
struct xfrm_mgr * km ;
rcu_read_lock ();
list_for_each_entry_rcu ( km , & xfrm_km_list , list )
if ( km -> notify )
km -> notify ( x , c );
rcu_read_unlock ();
}
事件类型和信号函数通过 xfrm_mgr
关联起来
/*
 * The xfrm_mgr instance for the netlink interface; it is the object passed to
 * xfrm_register_km() in xfrm_user_init().
 * Its notify member is the callback that km_state_notify() invokes.
 */
static struct xfrm_mgr netlink_mgr = {
. id = "netlink" ,
. notify = xfrm_send_state_notify ,
. acquire = xfrm_send_acquire ,
. compile_policy = xfrm_compile_policy ,
. notify_policy = xfrm_send_policy_notify ,
. report = xfrm_send_report ,
. migrate = xfrm_send_migrate ,
. new_mapping = xfrm_send_mapping ,
};
/*
 * Module init (abridged excerpt: the declaration of rv, the other init steps
 * and the return statement are not shown).
 * The step shown registers netlink_mgr with xfrm_register_km().
 */
static int __init xfrm_user_init ( void )
{
rv = xfrm_register_km ( & netlink_mgr );
}
/* List of registered xfrm_mgr instances; km_state_notify() iterates it. */
static LIST_HEAD ( xfrm_km_list );
/*
 * Registers a key manager by appending km to the tail of xfrm_km_list with
 * list_add_tail_rcu().
 * Abridged excerpt: any locking around the insertion and the return statement
 * are not shown.
 */
int xfrm_register_km ( struct xfrm_mgr * km )
{
list_add_tail_rcu ( & km -> list , & xfrm_km_list );
}
/*
 * notify callback of netlink_mgr: dispatches on the event code in c->event.
 * Abridged excerpt: only the XFRM_MSG_NEWSA case is shown, which forwards to
 * xfrm_notify_sa() and returns its result; other cases and the fall-through
 * return are not shown.
 */
static int xfrm_send_state_notify ( struct xfrm_state * x , const struct km_event * c )
{
switch ( c -> event ) {
case XFRM_MSG_NEWSA :
return xfrm_notify_sa ( x , c );
}
}
/*
 * Builds a netlink message describing the state x and multicasts it to the
 * XFRMNLGRP_SA group on the namespace's XFRM netlink socket.
 * Abridged excerpt: the declarations of len and err, the computation of len,
 * the initialisation of p/id/nlh and all error checks are not shown.
 * Returns the result of nlmsg_multicast().
 */
static int xfrm_notify_sa ( struct xfrm_state * x , const struct km_event * c )
{
struct net * net = xs_net ( x );
struct xfrm_usersa_info * p ;
struct xfrm_usersa_id * id ;
struct nlmsghdr * nlh ;
struct sk_buff * skb ;
/* GFP_ATOMIC is used for both the allocation and the multicast below. */
skb = nlmsg_new ( len , GFP_ATOMIC );
err = copy_to_user_state_extra ( x , p , skb );
return nlmsg_multicast ( net -> xfrm . nlsk , skb , 0 , XFRMNLGRP_SA , GFP_ATOMIC );
}
nlmsg_multicast
nlmsg_multicast
netlink_broadcast
netlink_broadcast_filtered
do_one_broadcast
如何判断一个socket是不是加入了广播组呢?
/*
 * Per-socket step of a netlink broadcast (abridged excerpt: the actual
 * delivery code between the filter and the out label is not shown).
 * The filter shown skips the socket when any of these holds:
 *   - its portid equals p->portid;
 *   - p->group - 1 is not below nlk->ngroups;
 *   - bit (p->group - 1) is not set in the bitmap nlk->groups.
 * Returns 0 in the path shown.
 */
static int do_one_broadcast ( struct sock * sk ,
struct netlink_broadcast_data * p )
{
struct netlink_sock * nlk = nlk_sk ( sk );
/* Group numbers are 1-based here, bitmap positions 0-based: hence "- 1". */
if ( nlk -> portid == p -> portid || p -> group - 1 >= nlk -> ngroups ||
! test_bit ( p -> group - 1 , nlk -> groups ))
goto out ;
out:
return 0 ;
}
test_bit()
/*
 * Tests a single bit in a bitmap.
 * @nr:   zero-based bit index.
 * @addr: start of the bitmap, read here as an array of int.
 * Returns 1 if bit nr is set, 0 otherwise.
 * The index arithmetic (nr >> 5 selects the word, nr & 31 the bit inside it)
 * assumes that int is 32 bits wide.
 */
static inline int test_bit ( int nr , const volatile void * addr )
{
/* Shift the selected bit down to position 0, then mask with 1UL. */
return ( 1UL & ((( const int * ) addr )[ nr >> 5 ] >> ( nr & 31 ))) != 0UL ;
}
addr指向一个32位整数数组
nr >> 5 即 nr/32
所以((const int *) addr)[nr >> 5]
表示定位数组addr中第 nr/32 个元素.
nr & 31 即 nr%32
所以 (((const int *) addr)[nr >> 5] >> (nr & 31))
表示数组addr中第 nr/32 个元素的 第 nr%32 位 .
所以test_bit(nr, addr)的作用是
判断数组addr中第 nr/32 个元素的第 nr%32 位是否为1
test_bit()搞明白了,其中第一个参数 p->group - 1
也知道,此处的 p->group 就是 XFRMNLGRP_SA
;
但是 nlk->groups
是什么呢? 还得回到 netlink_broadcast_filtered
中
netlink广播组
/*
 * Per-protocol netlink bookkeeping; nl_table below is indexed by
 * sk->sk_protocol in the functions shown in this file.
 */
struct netlink_table {
struct nl_portid_hash hash ;
/* List of sockets walked by netlink_broadcast_filtered(); sockets are added
 * to it in netlink_update_subscriptions(). */
struct hlist_head mc_list ;
struct listeners __rcu * listeners ;
unsigned int flags ;
/* Read by netlink_realloc_groups() to size a socket's groups bitmap. */
unsigned int groups ;
struct mutex * cb_mutex ;
struct module * module ;
void ( * bind )( int group );
bool ( * compare )( struct net * net , struct sock * sock );
int registered ;
};
/* The table itself: a pointer used as an array of struct netlink_table. */
struct netlink_table * nl_table ;
EXPORT_SYMBOL_GPL ( nl_table );
/*
 * Broadcasts skb to the sockets of ssk's protocol that are on the protocol's
 * mc_list, calling do_one_broadcast() for each of them.
 * Abridged excerpt: most of the setup of info, the use of portid, filter and
 * filter_data, and the return statement are not shown.
 */
int netlink_broadcast_filtered ( struct sock * ssk , struct sk_buff * skb , u32 portid ,
u32 group , gfp_t allocation ,
int ( * filter )( struct sock * dsk , struct sk_buff * skb , void * data ),
void * filter_data )
{
struct net * net = sock_net ( ssk );
struct netlink_broadcast_data info ;
struct sock * sk ;
skb = netlink_trim ( skb , allocation );
/* Fill in the broadcast descriptor; only the group field is shown here. */
info . group = group ;
/* Visit every socket on the mc_list of this socket's protocol. */
sk_for_each_bound ( sk , & nl_table [ ssk -> sk_protocol ]. mc_list )
do_one_broadcast ( sk , & info );
}
遍历 nl_table[ssk->sk_protocol].mc_list
,对其中每一个sock判断其是否订阅了info.group,如果是就把消息(skb)发送给它.
重点就是 nl_table,如果还记得 nl_join_groups()
,当时是否还好奇join到哪里去了,现在看来,
join group 就是将socket和广播组的映射保存到内核的nl_table中。
一个socket加入一个group的流程是这样的
/*
 * User-space example: open a NETLINK_ROUTE socket and request group
 * membership through the nl_groups bitmask passed to bind().
 * NOTE(review): addr is not zero-initialised in this snippet, so its other
 * fields are left unset here — confirm against the full program.
 */
struct sockaddr_nl addr;
int nl_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
addr.nl_family = AF_NETLINK;
/* One bit per requested group; RTMGRP_LINK is commented out. */
addr.nl_groups = /*RTMGRP_LINK | */RTMGRP_IPV4_IFADDR | RTMGRP_IPV4_ROUTE;
bind(nl_sock, (struct sockaddr *)&addr, sizeof(addr));
join group
应该发生在 bind系统调用过程中。
bind()
/*
 * bind system call: resolves fd to a socket, copies the user-supplied address
 * into kernel memory, runs the security hook and then calls the socket's own
 * bind operation through sock->ops->bind.
 * Returns err, which holds the result of the last step that was executed.
 */
SYSCALL_DEFINE3 ( bind , int , fd , struct sockaddr __user * , umyaddr , int , addrlen )
{
struct socket * sock ;
struct sockaddr_storage address ;
int err , fput_needed ;
/* On failure sock is NULL and err has been set by the lookup. */
sock = sockfd_lookup_light ( fd , & err , & fput_needed );
if ( sock ) {
err = move_addr_to_kernel ( umyaddr , addrlen , & address );
if ( err >= 0 ) {
err = security_socket_bind ( sock ,
( struct sockaddr * ) & address ,
addrlen );
/* Only reach the protocol's bind when the security hook returned 0. */
if ( ! err )
err = sock -> ops -> bind ( sock ,
( struct sockaddr * )
& address , addrlen );
}
fput_light ( sock -> file , fput_needed );
}
return err ;
}
/*
 * Socket operations table for PF_NETLINK sockets; __netlink_create() installs
 * it as sock->ops, so the bind system call's sock->ops->bind resolves to
 * netlink_bind for these sockets.
 */
static const struct proto_ops netlink_ops = {
. family = PF_NETLINK ,
. owner = THIS_MODULE ,
. release = netlink_release ,
. bind = netlink_bind ,
. connect = netlink_connect ,
. socketpair = sock_no_socketpair ,
. accept = sock_no_accept ,
. getname = netlink_getname ,
. poll = netlink_poll ,
. ioctl = sock_no_ioctl ,
. listen = sock_no_listen ,
. shutdown = sock_no_shutdown ,
. setsockopt = netlink_setsockopt ,
. getsockopt = netlink_getsockopt ,
. sendmsg = netlink_sendmsg ,
. recvmsg = netlink_recvmsg ,
. mmap = netlink_mmap ,
. sendpage = sock_no_sendpage ,
};
/*
 * Socket-creation entry point for netlink (abridged excerpt: the declarations
 * of err and cb_mutex, everything around the call and the return statement
 * are not shown).
 * The step shown delegates to __netlink_create().
 */
static int netlink_create ( struct net * net , struct socket * sock , int protocol ,
int kern )
{
err = __netlink_create ( net , sock , cb_mutex , protocol );
}
/*
 * Inner socket-creation helper (abridged excerpt: only one statement is
 * shown, and the return statement is missing).
 * The statement shown installs netlink_ops as the socket's operations table.
 */
static int __netlink_create ( struct net * net , struct socket * sock ,
struct mutex * cb_mutex , int protocol )
{
sock -> ops = & netlink_ops ;
}
bind 最终调用了netlink_bind
/*
 * bind operation for netlink sockets (it is the .bind member of netlink_ops).
 * Abridged excerpt: the declaration of err, the code at "..." and the return
 * statement are not shown.
 * When the caller asks for any group (nl_groups != 0) it checks
 * netlink_capable() and calls netlink_realloc_groups(); it then updates the
 * socket's subscription count.
 * Returns -EPERM if netlink_capable() fails, or the error returned by
 * netlink_realloc_groups(), in the paths shown.
 */
static int netlink_bind ( struct socket * sock , struct sockaddr * addr ,
int addr_len )
{
struct sock * sk = sock -> sk ;
struct net * net = sock_net ( sk );
struct netlink_sock * nlk = nlk_sk ( sk );
struct sockaddr_nl * nladdr = ( struct sockaddr_nl * ) addr ;
/* Only superuser is allowed to listen multicasts */
if ( nladdr -> nl_groups ) {
if ( ! netlink_capable ( sock , NL_CFG_F_NONROOT_RECV ))
return - EPERM ;
err = netlink_realloc_groups ( sk );
if ( err )
return err ;
}
...
/*
 * New subscription count = old count + number of bits set in the requested
 * mask - number of bits set in the first word of the current groups bitmap.
 */
netlink_update_subscriptions ( sk , nlk -> subscriptions +
hweight32 ( nladdr -> nl_groups ) -
hweight32 ( nlk -> groups [ 0 ]));
}
分配groups
/*
 * Resizes the socket's groups bitmap to match the group count of its protocol
 * in nl_table, and records that count in nlk->ngroups.
 * Abridged excerpt: the declarations of groups and new_groups, the check of
 * the krealloc() result, the counterpart of netlink_table_grab() and the
 * return statement are not shown.
 */
static int netlink_realloc_groups ( struct sock * sk )
{
struct netlink_sock * nlk = nlk_sk ( sk );
netlink_table_grab ();
groups = nl_table [ sk -> sk_protocol ]. groups ;
/* NLGRPSZ(groups) is the allocation size used for a bitmap of that many groups. */
new_groups = krealloc ( nlk -> groups , NLGRPSZ ( groups ), GFP_ATOMIC );
nlk -> groups = new_groups ;
nlk -> ngroups = groups ;
}
更新 nl_table
/*
 * Adds the socket to the mc_list of its protocol in nl_table and stores the
 * new subscription count in nlk->subscriptions.
 * Abridged excerpt: as shown, the list insertion is unconditional; any
 * conditions around it in the full function are not visible here.
 */
static void
netlink_update_subscriptions ( struct sock * sk , unsigned int subscriptions )
{
struct netlink_sock * nlk = nlk_sk ( sk );
/* mc_list is the list that netlink_broadcast_filtered() iterates. */
sk_add_bind_node ( sk , & nl_table [ sk -> sk_protocol ]. mc_list );
nlk -> subscriptions = subscriptions ;
}
End
已经写得太多了,Linux这棵大树枝繁叶茂,飞进去容易迷路,还好有Ctags地图和Cscope指南针带我突围。 本文以 XFRM_MSG_NEWSA 为起点,
分析了Netlink 发送NEWSA通知的过程,以及bind系统调用 Join Group的过程。
留下评论