The interface between user space program and kernel network is Socket. We access to socket using
- socket api e.g.
connect
,accept
,bind
,listen
,send
,receive
or - file api e.g.
read
,write
,close
Socket api is complete while file api some time offer only few operations, and alone is not enough
to make socket work as the open
is not provided, so application end up with use only socket api.
It is important to know that syscall(s) related to socket use file descriptor associated with the kernel socket data structure.
Socket data structures
Socket
struct socket {
socket_state state;
unsigned long flags;
const struct proto_ops *ops;
struct fasync_struct *fasync_list;
struct file *file;
struct sock *sk;
wait_queue_head_t wait;
short type;
};
Sock structure encapsulate internal implementation of socket
struct sock {
/*
* Now struct inet_timewait_sock also uses sock_common, so please just
* don't add nothing before this first member (__sk_common) --acme
*/
struct sock_common __sk_common;
#define sk_family __sk_common.skc_family
#define sk_state __sk_common.skc_state
#define sk_reuse __sk_common.skc_reuse
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
#define sk_node __sk_common.skc_node
#define sk_bind_node __sk_common.skc_bind_node
#define sk_refcnt __sk_common.skc_refcnt
#define sk_hash __sk_common.skc_hash
#define sk_prot __sk_common.skc_prot
unsigned char sk_shutdown : 2,
sk_no_check : 2,
sk_userlocks : 4;
unsigned char sk_protocol;
unsigned short sk_type;
int sk_rcvbuf;
socket_lock_t sk_lock;
wait_queue_head_t *sk_sleep;
struct dst_entry *sk_dst_cache;
struct xfrm_policy *sk_policy[2];
rwlock_t sk_dst_lock;
atomic_t sk_rmem_alloc;
atomic_t sk_wmem_alloc;
atomic_t sk_omem_alloc;
struct sk_buff_head sk_receive_queue;
struct sk_buff_head sk_write_queue;
struct sk_buff_head sk_async_wait_queue;
int sk_wmem_queued;
int sk_forward_alloc;
gfp_t sk_allocation;
int sk_sndbuf;
int sk_route_caps;
int sk_gso_type;
int sk_rcvlowat;
unsigned long sk_flags;
unsigned long sk_lingertime;
/*
* The backlog queue is special, it is always used with
* the per-socket spinlock held and requires low latency
* access. Therefore we special case it's implementation.
*/
struct {
struct sk_buff *head;
struct sk_buff *tail;
} sk_backlog;
struct sk_buff_head sk_error_queue;
struct proto *sk_prot_creator;
rwlock_t sk_callback_lock;
int sk_err,
sk_err_soft;
unsigned short sk_ack_backlog;
unsigned short sk_max_ack_backlog;
__u32 sk_priority;
struct ucred sk_peercred;
long sk_rcvtimeo;
long sk_sndtimeo;
struct sk_filter *sk_filter;
void *sk_protinfo;
struct timer_list sk_timer;
struct timeval sk_stamp;
struct socket *sk_socket;
void *sk_user_data;
struct page *sk_sndmsg_page;
struct sk_buff *sk_send_head;
__u32 sk_sndmsg_off;
int sk_write_pending;
void *sk_security;
void (*sk_state_change)(struct sock *sk);
void (*sk_data_ready)(struct sock *sk, int bytes);
void (*sk_write_space)(struct sock *sk);
void (*sk_error_report)(struct sock *sk);
int (*sk_backlog_rcv)(struct sock *sk,
struct sk_buff *skb);
void (*sk_create_child)(struct sock *sk, struct sock *newsk);
void (*sk_destruct)(struct sock *sk);
};
Create a socket
asmlinkage long sys_socket(int family, int type, int protocol)
{
int retval;
struct socket *sock;
retval = sock_create(family, type, protocol, &sock);
if (retval < 0)
goto out;
retval = sock_map_fd(sock); /* assign function pointers for file interface*/
...
return retval;
}
The syscall return a file descriptor associated to the newly created socket
static int __sock_create(int family, int type, int protocol, struct socket **res, int kern)
{
…
struct socket *sock;
...
if (!(sock = sock_alloc())) {
...
if ((err = net_families[family]->create(sock, protocol)) < 0) {
...
*res = sock;
…
initialization of socket function pointers
There is a function responsible for initializing socket for each socket family/domain
struct net_proto_family {
int family;
int (*create)(struct socket *sock, int protocol);
...
};
int sock_register(struct net_proto_family *ops)
{
...
if (net_families[ops->family] == NULL) {
net_families[ops->family]=ops;
err = 0;
}
…
The socket family holding initialization function get registered at the system boot
static int __init inet_init(void)
{
…
(void)sock_register(&inet_family_ops);
…
E.g. of structure of socket family for IPv4 Internet protocols
static struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
static int inet_create(struct socket *sock, int protocol)
{
struct sock *sk;
...
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock();
list_for_each_rcu(p, &inetsw[sock->type]) {
answer = list_entry(p, struct inet_protosw, list);
/* Check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
...
sock->ops = answer->ops; /* the socket operation is initialized from one registered in global variable inetsw */
answer_prot = answer->prot; /* this will be used internally when ops actually delegate its work to a functions specified here*/
...
sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);
…
sock_init_data(sock, sk)
….
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
The operation functions are specific to protocol (normally 0 as there is only one protocol) of the family (local, IPv4) and type of the socket (datagram, stream or raw)
static int __init inet_init(void)
{
…
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
...
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.capability = -1,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops, /*functions here will call those specified in .prot via internal socket*/
.capability = -1,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
...
const struct proto_ops inet_dgram_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_dgram_connect,
...
.recvmsg = sock_common_recvmsg,
...
void inet_register_protosw(struct inet_protosw *p)
{
...
answer = NULL;
last_perm = &inetsw[p->type];
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);
/* Check only the non-wild match. */
if (INET_PROTOSW_PERMANENT & answer->flags) {
if (protocol == answer->protocol)
break;
last_perm = lh;
}
…
list_add_rcu(&p->list, last_perm);
void sock_init_data(struct socket *sock, struct sock *sk)
{
...
if(sock)
{
sk->sk_type = sock->type;
sk->sk_sleep = &sock->wait;
sock->sk = sk; /* wiring internal socket with socket*/
} else
sk->sk_sleep = NULL;
..
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
struct sock *sk_alloc(int family, gfp_t priority,
struct proto *prot, int zero_it)
{
struct sock *sk = NULL;
...
sk->sk_prot = sk->sk_prot_creator = prot;
struct proto udp_prot = {
.name = "UDP",
.owner = THIS_MODULE,
.close = udp_close,
.connect = ip4_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
.destroy = udp_destroy_sock,
.setsockopt = udp_setsockopt,
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
.sendpage = udp_sendpage,
.backlog_rcv = __udp_queue_rcv_skb,
.hash = udp_v4_hash,
.unhash = udp_v4_unhash,
.get_port = udp_v4_get_port,
.memory_allocated = &udp_memory_allocated,
.sysctl_mem = sysctl_udp_mem,
.sysctl_wmem = &sysctl_udp_wmem_min,
.sysctl_rmem = &sysctl_udp_rmem_min,
.obj_size = sizeof(struct udp_sock),
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
.compat_getsockopt = compat_udp_getsockopt,
#endif
};
References