当我们在C程序中用到某些库函数进行文件读取操作时,后续的整个过程都是透明的。为了了解文件系统在其中起到了什么作用,又是如何和内核的其他部分进行协作的,我们可以对 read() 的整条调用路径进行追踪,下面的代码均来自 linux-2.6.11.10 版本的内核。
首先,我们写下如下的测试程序 hello.c(文件名与后面的编译命令保持一致),其中 1.txt 里只有一行 Hello,World!。
/* hello.c — the traced test program: open 1.txt, read one word, print it.
 * The original listing lost its header names to HTML-escaping and also
 * leaked the FILE handle; both are restored/fixed here.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	char word[20];
	FILE *fp;

	/* "a+" opens for reading and appending, creating the file if absent */
	if ((fp = fopen("1.txt", "a+")) == NULL) {
		fprintf(stderr, "ERROR!");	/* diagnostics belong on stderr, not stdout */
		exit(EXIT_FAILURE);
	}

	/* %19s bounds the read to sizeof(word)-1 chars + NUL, preventing
	 * a buffer overflow that the unbounded %s allowed; check the result
	 * so an empty file does not print uninitialized memory. */
	if (fscanf(fp, "%19s", word) == 1)
		printf("%s\n", word);

	fclose(fp);	/* the original never closed fp */
	return 0;
}
然后进行编译,并通过strace 工具查看函数运行时用到了哪些系统调用函数,并将结果输出到hello.txt中。
~/test$ gcc hello.c -o hello
~/test$ strace -o hello.txt ./hello
查看hello.txt中的主要内容如下
……openat(AT_FDCWD, "x86_64/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT……openat(AT_FDCWD, "1.txt", O_RDWR|O_CREAT|O_APPEND, 0666) = 3fstat(3, {st_mode=S_IFREG|0644, st_size=13, ...}) = 0read(3, "Hello,World!\n", 4096) = 13fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 1), ...}) = 0write(1, "Hello,World!\n", 13) = 13lseek(3, -1, SEEK_CUR) = 12 exit_group(0) = ?
可以看到首先打开了libc.so,这里面封装了我们需要的库函数,而后进程执行了openat、read、write、lseek等系统调用——strace 记录的正是系统调用,而不是库函数调用。
我们知道,系统调用有两种触发方式:一种是老旧的 int $0x80 软中断方式,另一种是较新的 sysenter 快速系统调用指令。具体细节这里不展开,但过程总是先将系统调用号存入 eax 寄存器,然后触发系统调用,这部分陷入内核的桩代码已经完全封装在 C 库中。进入内核后,会以 eax 中的调用号为下标查系统调用表,比如 read 的系统调用号是 3,查表就能找到对应的服务例程。
比如i386处理器的系统调用号局部如下所示
/* linux-2.6.11.10/include/asm-i386/unistd.h */
/*
 * System call numbers for i386 (excerpt).  The number is loaded into
 * the eax register before trapping into the kernel and is used there
 * as an index into the system call table.
 */
#define __NR_restart_syscall 0
#define __NR_exit 1
#define __NR_fork 2
#define __NR_read 3
#define __NR_write 4
#define __NR_open 5
#define __NR_close 6
#define __NR_waitpid 7
#define __NR_creat 8
#define __NR_link 9
#define __NR_unlink 10
#define __NR_execve 11
#define __NR_chdir 12
#define __NR_time 13
#define __NR_mknod 14
#define __NR_chmod 15
#define __NR_lchown 16
#define __NR_break 17
由上可见,read 的系统调用号为 3。在这个头文件的下面还能看到老式的系统调用封装宏(_syscall0 等):它们按系统调用所需的参数个数来区分,为用户态生成触发 int $0x80 的桩函数。在新内核中这些宏已被移除,这类用户态桩代码完全由 C 库(glibc)提供;因为这里的内核版本较老,所以头文件里仍然保留着它们。
/* linux-2.6.11.10/include/asm-i386/unistd.h */
/*
 * __syscall_return - translate the raw kernel return value `res` into
 * the C library convention: a small negative value (an address that
 * compares >= -(128+1) when viewed unsigned) is an error code, so the
 * positive errno is stored and -1 is returned; anything else is
 * returned unchanged.
 */
#define __syscall_return(type, res) \
do { \
	if ((unsigned long)(res) >= (unsigned long)(-(128 + 1))) { \
		errno = -(res); \
		res = -1; \
	} \
	return (type) (res); \
} while (0)

/* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */

/*
 * _syscall0 - generate a user-mode stub for a system call taking zero
 * arguments: the "=a"/"0" constraints place __NR_##name in eax and read
 * the result back from eax after the int $0x80 trap.  Sibling macros
 * (not shown) handle calls with more arguments — this is the
 * distinction "by parameter count" mentioned in the text.
 */
#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile ("int $0x80" \
	: "=a" (__res) \
	: "0" (__NR_##name)); \
__syscall_return(type,__res); \
}
而之后,read会调用相应的服务例程sys_read,此函数定义如下。
/* linux-2.6.11.10/fs/read_write.c */
/*
 * sys_read - service routine for the read(2) system call.
 * @fd:    descriptor of the file to read from
 * @buf:   user-space destination buffer
 * @count: maximum number of bytes to read
 *
 * Returns the number of bytes read (or a negative errno); -EBADF if
 * @fd does not refer to an open file.
 */
asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
	struct file *file;
	ssize_t ret = -EBADF;
	int fput_needed;

	file = fget_light(fd, &fput_needed);	/* map fd to the file object in the process's open-file table */
	if (file) {
		loff_t pos = file_pos_read(file);	/* fetch the current file offset */
		ret = vfs_read(file, buf, count, &pos);	/* buf is the user buffer, count the requested length */
		file_pos_write(file, pos);	/* store the updated offset back into the file object */
		fput_light(file, fput_needed);	/* drop the reference taken by fget_light */
	}

	return ret;
}
EXPORT_SYMBOL_GPL(sys_read);
该函数首先通过fget_light(light表示轻量级的)由文件描述符找到并返回一个文件对象地址,类型为虚拟文件系统层的struct file,然后获取文件偏移地址,并调用vfs_read即虚拟文件系统的读操作。从这里我们可以看到,无论底层是什么文件系统,由于有VFS这个中间层存在,对文件进行操作都可以把事情交给VFS来处理,这是抽象的好处。
我们可以在sys_read所在的文件里找到vfs_read。
/* linux-2.6.11.10/fs/read_write.c */
/*
 * vfs_read - VFS-level read: validate the request, then dispatch to the
 * filesystem's own ->read (or the synchronous wrapper around ->aio_read).
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))	/* the file must have been opened for reading */
		return -EBADF;
	if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))	/* the filesystem must provide a read operation */
		return -EINVAL;
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))	/* coarse check that the user buffer is a valid writable range */
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);	/* verify the region, e.g. against mandatory locks */
	if (!ret) {
		ret = security_file_permission (file, MAY_READ);	/* security-module hook: is reading permitted? */
		if (!ret) {
			if (file->f_op->read)
				ret = file->f_op->read(file, buf, count, pos);	/* dispatch to the filesystem's read function */
			else
				ret = do_sync_read(file, buf, count, pos);	/* fall back: synchronous wrapper over ->aio_read */
			if (ret > 0) {
				dnotify_parent(file->f_dentry, DN_ACCESS);	/* notify the parent directory of the access */
				current->rchar += ret;
			}
			current->syscr++;	/* per-process I/O statistics */
		}
	}

	return ret;
}
EXPORT_SYMBOL(vfs_read);
我们可以看到,vfs_read函数只是检查了一些状态,就使用回调函数 file->f_op->read,交由相应文件系统的read函数继续进行操作,这个file_operations在open文件的时候就已经填好了。我们可以在/linux-2.6.11.10/fs/ext2/file.c里找到ext2所有的文件操作,如下。其实在新内核里,read和write之类的操作已经改了。
/* linux-2.6.11.10/fs/ext2/file.c */
/*
 * File operations for ext2 regular files.  Note that read/write are not
 * ext2-specific: they are the generic page-cache based helpers from
 * mm/filemap.c.  (Newer kernels replaced several of these hooks.)
 */
struct file_operations ext2_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_file_read,	/* generic page-cache read */
	.write		= generic_file_write,
	.aio_read	= generic_file_aio_read,
	.aio_write	= generic_file_aio_write,
	.ioctl		= ext2_ioctl,
	.mmap		= generic_file_mmap,
	.open		= generic_file_open,
	.release	= ext2_release_file,
	.fsync		= ext2_sync_file,
	.readv		= generic_file_readv,
	.writev		= generic_file_writev,
	.sendfile	= generic_file_sendfile,
};
可以看到,ext2的read操作并没有额外定义,而是使用了一个通用文件读函数,在/linux-2.6.11.10/mm/filemap.c文件里可以找到这个函数,因为读写是基于页操作的。
/* linux-2.6.11.10/mm/filemap.c */
/*
 * generic_file_read - synchronous front end: wrap the user buffer in a
 * single iovec and a synchronous kiocb, then reuse the async read path.
 */
ssize_t generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
	struct iovec local_iov = { .iov_base = buf, .iov_len = count };	/* describes the user buffer and requested length */
	struct kiocb kiocb;	/* kernel I/O control block: descriptor for sync and async I/O */
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);	/* mark this operation as synchronous */
	ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);	/* common routine used by all filesystems */
	if (-EIOCBQUEUED == ret)	/* the request was queued... */
		ret = wait_on_sync_kiocb(&kiocb);	/* ...so block until it completes */
	return ret;
}
EXPORT_SYMBOL(generic_file_read);
这个函数继续调用了一个通用例程,即__generic_file_aio_read,字面理解就是异步I/O读,它不是立即读取,而是会先在一个链表里排队,如果在排队就需要继续等。
/* linux-2.6.11.10/mm/filemap.c */
/*
 * __generic_file_aio_read - common read routine for all filesystems.
 * Validates the iovec array, then either goes direct-to-disk (O_DIRECT)
 * or reads each segment through the page cache.
 */
ssize_t
__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
{
	struct file *filp = iocb->ki_filp;	/* file object this read operates on */
	ssize_t retval;
	unsigned long seg;
	size_t count;

	count = 0;
	for (seg = 0; seg < nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		count += iv->iov_len;
		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))	/* is the user buffer described by this iovec valid? */
			continue;
		if (seg == 0)
			return -EFAULT;
		nr_segs = seg;
		count -= iv->iov_len;	/* This segment is no good */
		break;
	}

	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
	if (filp->f_flags & O_DIRECT) {	/* direct I/O: bypass the page cache */
		loff_t pos = *ppos, size;
		struct address_space *mapping;
		struct inode *inode;

		mapping = filp->f_mapping;
		inode = mapping->host;
		retval = 0;
		if (!count)
			goto out; /* skip atime */
		size = i_size_read(inode);
		if (pos < size) {
			retval = generic_file_direct_IO(READ, iocb, iov, pos, nr_segs);
			if (retval >= 0 && !is_sync_kiocb(iocb))
				retval = -EIOCBQUEUED;
			if (retval > 0)
				*ppos = pos + retval;
		}
		file_accessed(filp);
		goto out;
	}

	retval = 0;	/* not direct I/O: read through the page cache */
	if (count) {
		for (seg = 0; seg < nr_segs; seg++) {
			read_descriptor_t desc;	/* per-segment read-state descriptor */

			desc.written = 0;
			desc.arg.buf = iov[seg].iov_base;	/* user buffer for this segment */
			desc.count = iov[seg].iov_len;	/* bytes to read into it */
			if (desc.count == 0)
				continue;
			desc.error = 0;
			do_generic_file_read(filp,ppos,&desc,file_read_actor);	/* page-cache read of this segment */
			retval += desc.written;
			if (!retval) {
				retval = desc.error;
				break;
			}
		}
	}
out:
	return retval;
}
EXPORT_SYMBOL(__generic_file_aio_read);
我们可以将上面这个函数粗略划分为三部分:检查部分、直接I/O读取和页高速缓存读取。如果设置了O_DIRECT标志,则绕过页高速缓存直接读取,调用generic_file_direct_IO();否则要使用页高速缓存,调用do_generic_file_read。我们主要关注页高速缓存读取。
/* linux-2.6.11.10/include/linux/fs.h */
/*
 * do_generic_file_read - thin wrapper that turns a read of the file
 * into a read of the file's page-cache mapping (filp->f_mapping),
 * also passing the per-file readahead state (filp->f_ra).
 */
static inline void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
	do_generic_mapping_read(filp->f_mapping, &filp->f_ra, filp, ppos, desc, actor);
}
do_generic_file_read会继续调用do_generic_mapping_read,这个调用表示对文件的读操作转换为对页高速缓存的读操作。
之所以要在I/O过程中加入页高速缓存这么一个缓冲层,是为了提高读取的效率,我们希望能尽量减少对磁盘的读取,而将读取放到内存中进行,所以引入页高速缓存这么一个中间层。
上面的参数中有一个filp->f_mapping,这个是一个地址空间变量,其定义如下。
/* linux-2.6.11.10/include/linux/fs.h */
/*
 * struct address_space - ties an on-disk object (the host inode) to its
 * cached pages (page_tree): the per-file view of the page cache.
 */
struct address_space {
	struct inode		*host;		/* owner: inode, block_device */
	struct radix_tree_root	page_tree;	/* radix tree of all pages */
	spinlock_t		tree_lock;	/* and spinlock protecting it */
	unsigned int		i_mmap_writable;/* count VM_SHARED mappings */
	struct prio_tree_root	i_mmap;		/* tree of private and shared mappings */
	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
	spinlock_t		i_mmap_lock;	/* protect tree, count, list */
	unsigned int		truncate_count;	/* Cover race condition with truncate */
	unsigned long		nrpages;	/* number of total pages */
	pgoff_t			writeback_index;/* writeback starts here */
	struct address_space_operations *a_ops;	/* methods */
	unsigned long		flags;		/* error bits/gfp mask */
	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
	spinlock_t		private_lock;	/* for use by the address_space */
	struct list_head	private_list;	/* ditto */
	struct address_space	*assoc_mapping;	/* ditto */
} __attribute__((aligned(sizeof(long))));
通过host和page_tree两个属性,一个address_space结构体可以将一个文件和属于它的缓存页联系起来。page_tree是struct radix_tree_root类型的,就是一棵树的根,它指向一棵基树(radix tree),相应的页都挂在叶子节点上,这样按文件内偏移查找缓存页就很高效了。
在do_generic_mapping_read里,检查完基础数据后,会建立一个循环,这个循环每次读一页内容,直到读完所有内容。
首先是find_page,它会在关联页的基树里查找相应的页:如果没找到,就跳到no_cached_page,重新分配一个页插入到基树里去;如果页存在但内容不是最新的(PageUptodate为假——注意这并非"脏页",而是页中数据尚未从磁盘读入或已失效),则需要先读盘更新;如果既能找到,数据又是最新的,那么直接跳到page_ok将数据拷贝到用户态即可。
/* linux-2.6.11.10/mm/filemap.c — fragment of do_generic_mapping_read */
find_page:
	page = find_get_page(mapping, index);	/* look the page up in the page cache (radix tree) */
	if (unlikely(page == NULL)) {
		/* cache miss: record it for the readahead logic, then allocate */
		handle_ra_miss(mapping, &ra, index);
		goto no_cached_page;
	}
	if (!PageUptodate(page))	/* page exists but its data is not valid yet (NOT "dirty") — must read from disk */
		goto page_not_up_to_date;
找到页以后开始读页,主要的重点语句是这一句
/* linux-2.6.11.10/mm/filemap.c — fragment of do_generic_mapping_read */
readpage:
	/* Start the actual read. The read will unlock the page. */
	error = mapping->a_ops->readpage(filp, page);	/* filesystem callback, e.g. ext2_readpage */
	if (unlikely(error))
		goto readpage_error;
	if (!PageUptodate(page)) {
		lock_page(page);
		if (!PageUptodate(page)) {
			if (page->mapping == NULL) {
				/*
				 * invalidate_inode_pages got it
				 */
				unlock_page(page);
				page_cache_release(page);
				goto find_page;
			}
			unlock_page(page);
			error = -EIO;
			goto readpage_error;
		}
		unlock_page(page);
	}

	/*
	 * i_size must be checked after we have done ->readpage.
	 *
	 * Checking i_size after the readpage allows us to calculate
	 * the correct value for "nr", which means the zero-filled
	 * part of the page is not copied back to userspace (unless
	 * another truncate extends the file - this is desired though).
	 */
	isize = i_size_read(inode);
	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
	if (unlikely(!isize || index > end_index)) {
		page_cache_release(page);
		goto out;
	}

	/* nr is the maximum number of bytes to copy from this page */
	nr = PAGE_CACHE_SIZE;
	if (index == end_index) {
		/* last page of the file: clamp nr to the bytes inside i_size */
		nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
		if (nr <= offset) {
			page_cache_release(page);
			goto out;
		}
	}
	nr = nr - offset;
	goto page_ok;
这里又出现了一个回调函数,又调用了相关文件系统的相关函数,我们可以找到ext2的a_ops定义如下:
/* linux-2.6.11.10/fs/ext2/inode.c */
/* Page-cache (address_space) operations for ext2 files. */
struct address_space_operations ext2_aops = {
	.readpage		= ext2_readpage,	/* read one page in from disk */
	.readpages		= ext2_readpages,
	.writepage		= ext2_writepage,
	.sync_page		= block_sync_page,
	.prepare_write		= ext2_prepare_write,
	.commit_write		= generic_commit_write,
	.bmap			= ext2_bmap,
	.direct_IO		= ext2_direct_IO,
	.writepages		= ext2_writepages,
};
再找到ext2_readpage开始我们的读页操作。
/* linux-2.6.11.10/fs/ext2/inode.c */
/*
 * ext2_readpage - ext2's ->readpage implementation: delegate to the
 * generic mpage helper, supplying ext2_get_block so it can map file
 * blocks to on-disk blocks.
 */
static int ext2_readpage(struct file *file, struct page *page)
{
	return mpage_readpage(page, ext2_get_block);
}
这里它又继续调用了一个通用例程mpage_readpage,传入了页描述符以及ext2的数据块寻址函数ext2_get_block。
/* linux-2.6.11.10/fs/mpage.c */
/*
 * mpage_readpage - read one page: build a bio describing the disk
 * sectors backing the page, then submit it to the block layer.
 */
int mpage_readpage(struct page *page, get_block_t get_block)
{
	struct bio *bio = NULL;
	sector_t last_block_in_bio = 0;

	bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, get_block);	/* construct the bio for this page */
	if (bio)
		mpage_bio_submit(READ, bio);	/* hand the request to the block layer */
	return 0;
}
EXPORT_SYMBOL(mpage_readpage);
这里就两步操作:构造一个struct bio对象,然后提交这个任务。bio是通用块层用来管理传输数据的结构,它把一段磁盘存储区和一块内存区域联系起来。
然后提交这个任务,这里面其实还有一个调度过程,所有的bio请求都在一个队列里,它可以重排读写数据块的请求,在重复访问文件同一个部分或多进程访问同一数据,可以大大提高读取效率。
最终,这件读操作会交给磁盘的设备驱动程序来进行真正的数据操作。
读完以后,再回到do_generic_mapping_read,跳到page_ok,它会通过actor回调(即file_read_actor)调用__copy_to_user()函数,将数据拷贝到用户态缓冲区。
/* linux-2.6.11.10/mm/filemap.c — fragment of do_generic_mapping_read */
page_ok:
	/* If users can be writing to this page using arbitrary
	 * virtual addresses, take care about potential aliasing
	 * before reading the page on the kernel side.
	 */
	if (mapping_writably_mapped(mapping))
		flush_dcache_page(page);

	/*
	 * When (part of) the same page is read multiple times
	 * in succession, only mark it as accessed the first time.
	 */
	if (prev_index != index)
		mark_page_accessed(page);
	prev_index = index;

	/*
	 * Ok, we have the page, and it's up-to-date, so
	 * now we can copy it to user space...
	 *
	 * The actor routine returns how many bytes were actually used..
	 * NOTE! This may not be the same as how much of a user buffer
	 * we filled up (we may be padding etc), so we can only update
	 * "pos" here (the actor routine has to update the user buffer
	 * pointers and the remaining count).
	 */
	ret = actor(desc, page, offset, nr);	/* here actor is file_read_actor: copy to user space */
	offset += ret;
	index += offset >> PAGE_CACHE_SHIFT;	/* advance to the next page if we crossed a page boundary */
	offset &= ~PAGE_CACHE_MASK;

	page_cache_release(page);
	if (ret == nr && desc->count)
		continue;	/* more to read: loop for the next page */
	goto out;
/* linux-2.6.11.10/mm/filemap.c */
/*
 * file_read_actor - copy `size` bytes from a page-cache page into the
 * user buffer tracked by `desc`, updating the descriptor's progress
 * fields.  Returns the number of bytes actually copied.
 */
int file_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset, unsigned long size)
{
	char *kaddr;
	unsigned long left, count = desc->count;

	if (size > count)
		size = count;	/* never copy more than the caller still wants */

	/*
	 * Faults on the destination of a read are common, so do it before
	 * taking the kmap.
	 */
	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
		kaddr = kmap_atomic(page, KM_USER0);	/* fast, non-sleeping kernel mapping of the page */
		left = __copy_to_user_inatomic(desc->arg.buf, kaddr + offset, size);
		kunmap_atomic(kaddr, KM_USER0);
		if (left == 0)
			goto success;
	}

	/* Do it the slow way */
	kaddr = kmap(page);
	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
	kunmap(page);

	if (left) {
		size -= left;	/* partial copy: report only what succeeded */
		desc->error = -EFAULT;
	}
success:
	desc->count = count - size;	/* bytes the caller still wants */
	desc->written += size;		/* bytes delivered so far */
	desc->arg.buf += size;		/* advance the user buffer pointer */
	return size;
}
然后更新一些计数,再一步步往上返回到最开始的read()系统调用,调用就结束了。