socket fs(2)

本文介绍 Linux 内核中 sockfs 的创建过程、socket() 系统调用的实现（以 AF_NETLINK 为例说明 socket 的创建），以及对 socket 文件描述符执行 read() 系统调用时的内核代码路径，供需要的开发者参考。

/************************************************************************************/

socketfs的创建

sock_init ->
{
   register_filesystem(&sock_fs_type);
   sock_mnt = kern_mount(&sock_fs_type);
}
/*
 * Filesystem type descriptor for the socket pseudo-filesystem "sockfs".
 * Only the name, the mount callback and the superblock teardown callback
 * are filled in.
 */
static struct file_system_type sock_fs_type = {
   .name    = "sockfs",
   .mount   = sockfs_mount,
   .kill_sb = kill_anon_super,
};

kern_mount -> kern_mount_data -> vfs_kern_mount -> mount_fs -> type->mount();
/*
 * Mount callback installed in sock_fs_type. Delegates to mount_pseudo()
 * with the root name "socket:", the sockfs superblock and dentry operations
 * and SOCKFS_MAGIC. The flags, dev_name and data arguments are not used.
 */
static struct dentry *sockfs_mount(struct file_system_type *fs_type,
           int flags, const char *dev_name, void *data)
{
   struct dentry *root;

   root = mount_pseudo(fs_type, "socket:", &sockfs_ops,
                       &sockfs_dentry_operations, SOCKFS_MAGIC);
   return root;
}

/*
 * Superblock operations handed to mount_pseudo() by sockfs_mount():
 * socket-specific inode allocation/destruction plus the generic
 * simple_statfs.
 */
static const struct super_operations sockfs_ops = {
   .alloc_inode   = sock_alloc_inode,
   .destroy_inode = sock_destroy_inode,
   .statfs        = simple_statfs,
};

/*
 * Dentry operations handed to mount_pseudo() by sockfs_mount(); only the
 * d_dname hook is provided.
 */
static const struct dentry_operations sockfs_dentry_operations = {
   .d_dname = sockfs_dname,
};

/*
* Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
* will never be mountable)
*/
struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name,
   const struct super_operations *ops,
   const struct dentry_operations *dops, unsigned long magic)
{
   struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
   struct dentry *dentry;
   struct inode *root;
   struct qstr d_name = {.name = name, .len = strlen(name)};

   if (IS_ERR(s))
       return ERR_CAST(s);

   s->s_flags = MS_NOUSER;
   s->s_maxbytes = MAX_LFS_FILESIZE;
   s->s_blocksize = PAGE_SIZE;
   s->s_blocksize_bits = PAGE_SHIFT;
   s->s_magic = magic;
   s->s_op = ops ? ops : &simple_super_operations;
   s->s_time_gran = 1;
   root = new_inode(s);
   if (!root)
       goto Enomem;
   /*
   * since this is the first inode, make it number 1. New inodes created
   * after this must take care not to collide with it (by passing
   * max_reserved of 1 to iunique).
   */
   root->i_ino = 1;
   root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
   root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
   dentry = __d_alloc(s, &d_name);
   if (!dentry) {
       iput(root);
       goto Enomem;
   }
   d_instantiate(dentry, root);
   s->s_root = dentry;
   s->s_d_op = dops;
   s->s_flags |= MS_ACTIVE;
   return dget(s->s_root);

Enomem:
   deactivate_locked_super(s);
   return ERR_PTR(-ENOMEM);
}

/****************************************************************************************/

socket系统调用的实现：

主要有两部分：创建 socket；将 socket 与文件描述符关联。
/*
 * socket(2) entry point.
 *
 * The SOCK_CLOEXEC / SOCK_NONBLOCK bits carried in @type are split off
 * into flags, the socket is created with sock_create(), and it is then
 * attached to a new file descriptor with sock_map_fd().
 *
 * Returns the new descriptor, -EINVAL for unknown flag bits, or the
 * negative error from sock_create() / sock_map_fd().
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
   struct socket *sock;
   int flags;
   int retval;

   /* Check the SOCK_* constants for consistency. */
   BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
   BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
   BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
   BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

   /* Everything outside the type mask is a flag bit. */
   flags = type & ~SOCK_TYPE_MASK;
   if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
      return -EINVAL;
   type &= SOCK_TYPE_MASK;

   /* Translate SOCK_NONBLOCK to O_NONBLOCK where the two values differ. */
   if ((flags & SOCK_NONBLOCK) && SOCK_NONBLOCK != O_NONBLOCK) {
      flags &= ~SOCK_NONBLOCK;
      flags |= O_NONBLOCK;
   }

   retval = sock_create(family, type, protocol, &sock);
   if (retval < 0)
      return retval;

   retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
   if (retval < 0)
      sock_release(sock);

   /* It may be already another descriptor 8) Not kernel problem. */
   return retval;
}

创建socket

/*
 * sock_create: create a socket in the network namespace of the calling
 * task. The namespace is read from current->nsproxy->net_ns and passed to
 * __sock_create() with kern set to 0.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
   struct net *net = current->nsproxy->net_ns;

   return __sock_create(net, family, type, protocol, res, 0);
}

crash> task_struct | grep nsproxy
    struct nsproxy *nsproxy;
crash> nsproxy
struct nsproxy {
    atomic_t count;
    struct uts_namespace *uts_ns;
    struct ipc_namespace *ipc_ns;
    struct mnt_namespace *mnt_ns;
    struct pid_namespace *pid_ns;
    struct net *net_ns;
}
SIZE: 24

crash> struct net
struct net {
    atomic_t passive;
    atomic_t count;
    spinlock_t rules_mod_lock;
    struct list_head list;
    struct list_head cleanup_list;
    struct list_head exit_list;
    struct proc_dir_entry *proc_net;
    struct proc_dir_entry *proc_net_stat;
    struct ctl_table_set sysctls;
    struct sock *rtnl;
    struct sock *genl_sock;
    struct list_head dev_base_head;
    struct hlist_head *dev_name_head;
    struct hlist_head *dev_index_head;
    unsigned int dev_base_seq;
    struct list_head rules_ops;
    struct net_device *loopback_dev;
    struct netns_core core;
    struct netns_mib mib;
    struct netns_packet packet;
    struct netns_unix unx;
    struct netns_ipv4 ipv4;
    struct netns_ipv6 ipv6;
    struct netns_xt xt;
    struct netns_ct ct;
    struct sock *nfnl;
    struct sock *nfnl_stash;
    struct sk_buff_head wext_nlevents;
    struct net_generic *gen;
    struct netns_xfrm xfrm;
    struct netns_ipvs *ipvs;
}
SIZE: 1376

/*__sock_create创建了socket,调用对应net family的create 函数*/
crash> socket
struct socket {
    socket_state state;
    short type;
    unsigned long flags;
    struct socket_wq *wq;
    struct file *file;
    struct sock *sk;
    const struct proto_ops *ops;
}
SIZE: 28

int __sock_create(struct net *net, int family, int type, int protocol,
           struct socket **res, int kern)
{
   struct socket *sock;
   const struct net_proto_family *pf;

   /*
   *   Allocate the socket and allow the family to set things up. if
   *   the protocol is 0, the family is instructed to select an appropriate
   *   default.
   */
   sock = sock_alloc();
   sock->type = type;
   pf = rcu_dereference(net_families[family]);
   err = pf->create(net, sock, protocol, kern);
   *res = sock;

   return 0;
}

crash> net_proto_family
struct net_proto_family {
    int family;
    int (*create)(struct net *, struct socket *, int, int);
    struct module *owner;
}

crash> net_families
net_families = $1 =
{0x0,
0xc0561d3c <unix_family_ops>,
0xc0560954 <inet_family_ops>,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0xc0561eb0 <inet6_family_ops>,
0x0, 0x0, 0x0, 0x0,
0xc0563458 <pfkey_family_ops>,
0xc055f384 <netlink_family_ops>,
0xc05632e0 <packet_family_ops>,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0xc0543358 <pppox_proto_family>,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0xc0703c8c <bt_sock_family_ops>,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0
}

关联 socket 与 file

/*
 * Attach @sock to a file descriptor of the current process.
 *
 * sock_alloc_file() supplies both the descriptor number and the struct
 * file; the file is published with fd_install() only when a valid
 * (non-negative) descriptor came back. Returns that descriptor, or the
 * negative value returned by sock_alloc_file().
 */
int sock_map_fd(struct socket *sock, int flags)
{
   struct file *sock_file;
   int fd;

   fd = sock_alloc_file(sock, &sock_file, flags);
   if (likely(fd >= 0))
      fd_install(fd, sock_file);

   return fd;
}

/*
*   Obtains the first available file descriptor and sets it up for use.
*
*   These functions create file structures and maps them to fd space
*   of the current process. On success it returns file descriptor
*   and file struct implicitly stored in sock->file.
*   Note that another thread may close file descriptor before we return
*   from this function. We use the fact that now we do not refer
*   to socket after mapping. If one day we will need it, this
*   function will increment ref. count on file by 1.
*
*   In any case returned fd MAY BE not valid!
*   This race condition is unavoidable
*   with shared fd spaces, we cannot solve it inside kernel,
*   but we take care of internal coherence yet.
*/

static int sock_alloc_file(struct socket *sock, struct file **f, int flags)
{
   struct qstr name = { .name = "" };
   struct path path;
   struct file *file;
   int fd;

   fd = get_unused_fd_flags(flags);

   path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name);

   path.mnt = mntget(sock_mnt);

   d_instantiate(path.dentry, SOCK_INODE(sock));
   / *inode的 fops赋值为socket_file_ops*/
   SOCK_INODE(sock)->i_fop = &socket_file_ops;/*inode fops*/

   file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
          &socket_file_ops);

   sock->file = file;
   file->f_flags = O_RDWR | (flags & O_NONBLOCK);
   file->f_pos = 0;/*is NULL*/
   file->private_data = sock;

   *f = file;
   return fd;
}

/*
 *   File operations used for socket files.
 *
 *   Socket files have a set of 'special' operations as well as the generic
 *   file ones. The special ones do not appear in this structure; they are
 *   done directly via the socketcall() multiplexor.
 */

static const struct file_operations socket_file_ops = {
   .owner          = THIS_MODULE,
   .llseek         = no_llseek,
   .aio_read       = sock_aio_read,
   .aio_write      = sock_aio_write,
   .poll           = sock_poll,
   .unlocked_ioctl = sock_ioctl,
   .mmap           = sock_mmap,
   .open           = sock_no_open,   /* special open code to disallow open via /proc */
   .release        = sock_close,
   .fasync         = sock_fasync,
   .sendpage       = sock_sendpage,
   .splice_write   = generic_splice_sendpage,
   .splice_read    = sock_splice_read,
};

/*
 * Install a file pointer in the fd array.
 *
 * Stores @file in slot @fd of the current task's fd table while holding
 * files->file_lock. The slot must still be empty; a non-NULL entry
 * triggers BUG_ON().
 */
void fd_install(unsigned int fd, struct file *file)
{
   struct files_struct *cur_files = current->files;
   struct fdtable *table;

   spin_lock(&cur_files->file_lock);

   table = files_fdtable(cur_files);
   BUG_ON(table->fd[fd] != NULL);
   rcu_assign_pointer(table->fd[fd], file);

   spin_unlock(&cur_files->file_lock);
}

/************************************************************************************/

以#define AF_NETLINK 16 为例看socket的创建过程：

int __sock_create(struct net *net, int family, int type, int protocol,
           struct socket **res, int kern)
{
   struct socket *sock;
   const struct net_proto_family *pf;

   /*
   *   Allocate the socket and allow the family to set things up. if
   *   the protocol is 0, the family is instructed to select an appropriate
   *   default.
   */
   sock = sock_alloc();
   sock->type = type;
   pf = rcu_dereference(net_families[family]);
   err = pf->create(net, sock, protocol, kern);
   *res = sock;

   return 0;
}

以 net_families[16] = 0xc055f384 <netlink_family_ops> 为例：
#define AF_NETLINK   16

crash> netlink_family_ops
netlink_family_ops = $11 = {
family = 16,
create = 0xc03f29b8 <netlink_create>,
owner = 0x0
}

/*
 * create() callback of the netlink protocol family (see
 * netlink_family_ops): sets up @sock as a netlink socket.
 *
 * Marks the socket SS_UNCONNECTED, builds the underlying struct sock via
 * __netlink_create(), bumps the per-net in-use counter for netlink_proto
 * and records the owning module in the netlink_sock.
 *
 * Returns the value returned by __netlink_create(): 0 on success or a
 * negative errno, in which case nothing else is touched. @kern is unused.
 *
 * NOTE(review): this excerpt has no code that selects a per-protocol
 * module or callback mutex, so both stay NULL here. With cb_mutex NULL,
 * __netlink_create() falls back to the socket's own cb_def_mutex. Confirm
 * against the full af_netlink.c before relying on this.
 */
static int netlink_create(struct net *net, struct socket *sock, int protocol,
              int kern)
{
   struct module *module = NULL;
   /* Must be initialised: __netlink_create() tests it for NULL. */
   struct mutex *cb_mutex = NULL;
   struct netlink_sock *nlk;
   int err;

   sock->state = SS_UNCONNECTED;

   err = __netlink_create(net, sock, cb_mutex, protocol);
   if (err < 0)
       return err;   /* no struct sock was attached; sock->sk is unusable */

   local_bh_disable();
   sock_prot_inuse_add(net, &netlink_proto, 1);
   local_bh_enable();

   nlk = nlk_sk(sock->sk);
   nlk->module = module;

   return err;
}

/*
 * Allocate and initialise the struct sock behind a netlink socket.
 *
 * Installs netlink_ops on @sock, allocates the sock with sk_alloc() and
 * initialises it with sock_init_data(). The callback mutex is @cb_mutex
 * when the caller supplies one; otherwise the socket's embedded
 * cb_def_mutex is initialised and used.
 *
 * Returns 0 on success, -ENOMEM if sk_alloc() fails (sock->ops has
 * already been set at that point).
 */
static int __netlink_create(struct net *net, struct socket *sock,
                struct mutex *cb_mutex, int protocol)
{
   struct netlink_sock *nlk;
   struct sock *sk;

   sock->ops = &netlink_ops;

   sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto);
   if (!sk)
      return -ENOMEM;

   sock_init_data(sock, sk);
   nlk = nlk_sk(sk);

   if (!cb_mutex) {
      /* No shared mutex supplied: use the per-socket default one. */
      nlk->cb_mutex = &nlk->cb_def_mutex;
      mutex_init(nlk->cb_mutex);
   } else {
      nlk->cb_mutex = cb_mutex;
   }
   init_waitqueue_head(&nlk->wait);

   sk->sk_destruct = netlink_sock_destruct;
   sk->sk_protocol = protocol;

   return 0;
}

/*
 * Socket-level operations installed on netlink sockets by
 * __netlink_create(). Operations that are not provided for netlink are
 * wired to the sock_no_*() stubs.
 */
static const struct proto_ops netlink_ops = {
   .family     = PF_NETLINK,
   .owner      = THIS_MODULE,
   .release    = netlink_release,
   .bind       = netlink_bind,
   .connect    = netlink_connect,
   .socketpair = sock_no_socketpair,
   .accept     = sock_no_accept,
   .getname    = netlink_getname,
   .poll       = datagram_poll,
   .ioctl      = sock_no_ioctl,
   .listen     = sock_no_listen,
   .shutdown   = sock_no_shutdown,
   .setsockopt = netlink_setsockopt,
   .getsockopt = netlink_getsockopt,
   .sendmsg    = netlink_sendmsg,
   .recvmsg    = netlink_recvmsg,
   .mmap       = sock_no_mmap,
   .sendpage   = sock_no_sendpage,
};

socket read system call

上图对应的代码流程

/*
 * read(2) entry point.
 *
 * Looks up the struct file behind @fd, reads from its current position
 * through vfs_read(), stores the updated position back and drops the
 * file reference. Returns the vfs_read() result, or -EBADF when @fd
 * does not resolve to a file.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
   struct file *file;
   ssize_t ret;
   loff_t pos;
   int fput_needed;

   /* Resolve the descriptor to its file object. */
   file = fget_light(fd, &fput_needed);
   if (!file)
      return -EBADF;

   /* Offset at which this read starts. */
   pos = file_pos_read(file);
   ret = vfs_read(file, buf, count, &pos);
   file_pos_write(file, pos);
   fput_light(file, fput_needed);

   return ret;
}

/*
 * Generic read path shared by all file types.
 *
 * Rejects files not opened for reading (-EBADF), files with neither a
 * read nor an aio_read method (-EINVAL) and user buffers that fail
 * access_ok() (-EFAULT). After rw_verify_area() succeeds, its return
 * value replaces @count and the file's read method is called; when only
 * aio_read exists, do_sync_read() is used instead.
 *
 * Returns the result of the read method, or a negative errno.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
   ssize_t ret;

   if (!(file->f_mode & FMODE_READ))
      return -EBADF;
   if (!(file->f_op && (file->f_op->read || file->f_op->aio_read)))
      return -EINVAL;
   if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
      return -EFAULT;

   ret = rw_verify_area(READ, file, pos, count);
   if (ret < 0)
      return ret;

   /* rw_verify_area() hands back the byte count to actually use. */
   count = ret;
   ret = file->f_op->read ? file->f_op->read(file, buf, count, pos)
                          : do_sync_read(file, buf, count, pos);

   /* Notification and read-byte accounting only when data was read. */
   if (ret > 0) {
      fsnotify_access(file);
      add_rchar(current, ret);
   }
   inc_syscr(current);

   return ret;
}

/*
 * Synchronous read implemented on top of a file's aio_read method.
 *
 * Wraps @buf/@len in a single iovec and a synchronous kiocb, calls
 * aio_read again for as long as it returns -EIOCBRETRY, and waits for
 * completion when it returns -EIOCBQUEUED. *ppos is updated from the
 * kiocb's position before returning the result.
 */
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
   struct iovec iov = { .iov_base = buf, .iov_len = len };
   struct kiocb kiocb;
   ssize_t ret;

   init_sync_kiocb(&kiocb, filp);
   kiocb.ki_pos = *ppos;
   kiocb.ki_left = len;
   kiocb.ki_nbytes = len;

   /* Re-issue the request while the method asks for a retry. */
   while ((ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos)) ==
          -EIOCBRETRY)
      wait_on_retry_sync_kiocb(&kiocb);

   /* Request was queued: block until it completes. */
   if (ret == -EIOCBQUEUED)
      ret = wait_on_sync_kiocb(&kiocb);

   *ppos = kiocb.ki_pos;
   return ret;
}

/*sockfs*/
.aio_read =    sock_aio_read,

/*
 * aio_read method of socket files (see socket_file_ops).
 *
 * A non-zero @pos yields -ESPIPE and a request with nothing left to read
 * returns 0. Otherwise a sock_iocb is obtained for @iocb and the request
 * is passed to do_sock_read(). Returns -ENOMEM if no sock_iocb could be
 * obtained.
 */
static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
               unsigned long nr_segs, loff_t pos)
{
   struct sock_iocb stack_siocb;
   struct sock_iocb *siocb;

   if (pos != 0)
      return -ESPIPE;

   /* Match SYS5 behaviour */
   if (iocb->ki_left == 0)
      return 0;

   siocb = alloc_sock_iocb(iocb, &stack_siocb);
   if (!siocb)
      return -ENOMEM;

   return do_sock_read(&siocb->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
}

/*
 * Build a msghdr for a plain read on a socket file and receive into it.
 *
 * The socket is taken from file->private_data. The total length is the
 * sum of all iovec segment lengths; no peer address and no control data
 * are requested, and MSG_DONTWAIT is set when the file is O_NONBLOCK.
 * Returns the result of __sock_recvmsg().
 */
static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
       struct file *file, const struct iovec *iov,
       unsigned long nr_segs)
{
   struct socket *sock = file->private_data;
   size_t total = 0;
   int i = 0;

   /* Total number of bytes the caller's segments can hold. */
   while (i < nr_segs) {
      total += iov[i].iov_len;
      i++;
   }

   msg->msg_name = NULL;
   msg->msg_namelen = 0;
   msg->msg_control = NULL;
   msg->msg_controllen = 0;
   msg->msg_iov = (struct iovec *)iov;
   msg->msg_iovlen = nr_segs;

   if (file->f_flags & O_NONBLOCK)
      msg->msg_flags = MSG_DONTWAIT;
   else
      msg->msg_flags = 0;

   return __sock_recvmsg(iocb, sock, msg, total, msg->msg_flags);
}

/*
 * Receive a message after consulting the security hook: a non-zero
 * result from security_socket_recvmsg() is returned as is, otherwise the
 * call continues in __sock_recvmsg_nosec().
 */
static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
               struct msghdr *msg, size_t size, int flags)
{
   int err;

   err = security_socket_recvmsg(sock, msg, size, flags);
   if (err)
      return err;

   return __sock_recvmsg_nosec(iocb, sock, msg, size, flags);
}

/*
 * Receive a message without any security check: records the request
 * parameters in the sock_iocb attached to @iocb and dispatches to the
 * socket's own recvmsg operation (sock->ops->recvmsg).
 */
static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
                       struct msghdr *msg, size_t size, int flags)
{
   struct sock_iocb *siocb = kiocb_to_siocb(iocb);

   sock_update_classid(sock->sk);

   /* Stash the request description in the per-iocb socket state. */
   siocb->sock  = sock;
   siocb->scm   = NULL;
   siocb->msg   = msg;
   siocb->size  = size;
   siocb->flags = flags;

   return sock->ops->recvmsg(iocb, sock, msg, size, flags);
}