当我们在C程序中用到某些库函数进行文件读取操作时,后续的整个过程都是透明的。为了了解文件系统在其中起到了什么作用,又是如何和内核的其他部分进行协作的,我们可以对 read() 的整条调用路径进行追踪,下面的代码均来自 linux-2.6.11.10 版本的内核。
首先,我们写下如下的测试程序 hello.c(文件名与后面的编译命令保持一致),其中 1.txt 里只有一行 Hello,World!。
/* hello.c — the traced test program: open 1.txt, read one word, print it.
 * The original listing lost its header names to HTML-escaping and also
 * leaked the FILE handle; both are restored/fixed here.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	char word[20];
	FILE *fp;

	/* "a+" opens for reading and appending, creating the file if absent */
	if ((fp = fopen("1.txt", "a+")) == NULL) {
		fprintf(stderr, "ERROR!");	/* diagnostics belong on stderr, not stdout */
		exit(EXIT_FAILURE);
	}

	/* %19s bounds the read to sizeof(word)-1 chars + NUL, preventing
	 * a buffer overflow that the unbounded %s allowed; check the result
	 * so an empty file does not print uninitialized memory. */
	if (fscanf(fp, "%19s", word) == 1)
		printf("%s\n", word);

	fclose(fp);	/* the original never closed fp */
	return 0;
}
然后进行编译,并通过strace 工具查看函数运行时用到了哪些系统调用函数,并将结果输出到hello.txt中。
~/test$ gcc hello.c -o hello
~/test$ strace -o hello.txt ./hello
查看hello.txt中的主要内容如下
……openat(AT_FDCWD, "x86_64/libc.so.6", O_RDONLY|O_CLOEXEC) = -1 ENOENT……openat(AT_FDCWD, "1.txt", O_RDWR|O_CREAT|O_APPEND, 0666) = 3fstat(3, {st_mode=S_IFREG|0644, st_size=13, ...}) = 0read(3, "Hello,World!\n", 4096) = 13fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 1), ...}) = 0write(1, "Hello,World!\n", 13) = 13lseek(3, -1, SEEK_CUR) = 12 exit_group(0) = ?
可以看到首先打开了libc.so,这里面封装了我们需要的库函数,而后进程执行了openat、read、write、lseek等系统调用——strace 记录的正是系统调用,而不是库函数调用。
我们知道,系统调用有两种触发方式:一种是老旧的 int $0x80 软中断方式,另一种是较新的 sysenter 快速系统调用指令。具体细节这里不展开,但过程总是先将系统调用号存入 eax 寄存器,然后触发系统调用,这部分陷入内核的桩代码已经完全封装在 C 库中。进入内核后,会以 eax 中的调用号为下标查系统调用表,比如 read 的系统调用号是 3,查表就能找到对应的服务例程。
比如i386处理器的系统调用号局部如下所示
/* linux-2.6.11.10/include/asm-i386/unistd.h */
/*
 * System call numbers for i386 (excerpt).  The number is loaded into
 * the eax register before trapping into the kernel and is used there
 * as an index into the system call table.
 */
#define __NR_restart_syscall 0
#define __NR_exit 1
#define __NR_fork 2
#define __NR_read 3
#define __NR_write 4
#define __NR_open 5
#define __NR_close 6
#define __NR_waitpid 7
#define __NR_creat 8
#define __NR_link 9
#define __NR_unlink 10
#define __NR_execve 11
#define __NR_chdir 12
#define __NR_time 13
#define __NR_mknod 14
#define __NR_chmod 15
#define __NR_lchown 16
#define __NR_break 17
由上可见,read 的系统调用号为 3。在这个头文件的下面还能看到老式的系统调用封装宏(_syscall0 等):它们按系统调用所需的参数个数来区分,为用户态生成触发 int $0x80 的桩函数。在新内核中这些宏已被移除,这类用户态桩代码完全由 C 库(glibc)提供;因为这里的内核版本较老,所以头文件里仍然保留着它们。
/* linux-2.6.11.10/include/asm-i386/unistd.h */
/*
 * __syscall_return - translate the raw kernel return value `res` into
 * the C library convention: a small negative value (an address that
 * compares >= -(128+1) when viewed unsigned) is an error code, so the
 * positive errno is stored and -1 is returned; anything else is
 * returned unchanged.
 */
#define __syscall_return(type, res) \
do { \
	if ((unsigned long)(res) >= (unsigned long)(-(128 + 1))) { \
		errno = -(res); \
		res = -1; \
	} \
	return (type) (res); \
} while (0)

/* XXX - _foo needs to be __foo, while __NR_bar could be _NR_bar. */

/*
 * _syscall0 - generate a user-mode stub for a system call taking zero
 * arguments: the "=a"/"0" constraints place __NR_##name in eax and read
 * the result back from eax after the int $0x80 trap.  Sibling macros
 * (not shown) handle calls with more arguments — this is the
 * distinction "by parameter count" mentioned in the text.
 */
#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile ("int $0x80" \
	: "=a" (__res) \
	: "0" (__NR_##name)); \
__syscall_return(type,__res); \
}
而之后,read会调用相应的服务例程sys_read,此函数定义如下。
/* linux-2.6.11.10/fs/read_write.c */
/*
 * sys_read - service routine for the read(2) system call.
 * @fd:    descriptor of the file to read from
 * @buf:   user-space destination buffer
 * @count: maximum number of bytes to read
 *
 * Returns the number of bytes read (or a negative errno); -EBADF if
 * @fd does not refer to an open file.
 */
asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
	struct file *file;
	ssize_t ret = -EBADF;
	int fput_needed;

	file = fget_light(fd, &fput_needed);	/* map fd to the file object in the process's open-file table */
	if (file) {
		loff_t pos = file_pos_read(file);	/* fetch the current file offset */
		ret = vfs_read(file, buf, count, &pos);	/* buf is the user buffer, count the requested length */
		file_pos_write(file, pos);	/* store the updated offset back into the file object */
		fput_light(file, fput_needed);	/* drop the reference taken by fget_light */
	}

	return ret;
}
EXPORT_SYMBOL_GPL(sys_read);
该函数首先通过fget_light(light表示轻量级的)由文件描述符找到并返回一个文件对象地址,类型为虚拟文件系统层的struct file,然后获取文件偏移地址,并调用vfs_read即虚拟文件系统的读操作。从这里我们可以看到,无论底层是什么文件系统,由于有VFS这个中间层存在,对文件进行操作都可以把事情交给VFS来处理,这是抽象的好处。
我们可以在sys_read所在的文件里找到vfs_read。
/* linux-2.6.11.10/fs/read_write.c */
/*
 * vfs_read - VFS-level read: validate the request, then dispatch to the
 * filesystem's own ->read (or the synchronous wrapper around ->aio_read).
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))	/* the file must have been opened for reading */
		return -EBADF;
	if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))	/* the filesystem must provide a read operation */
		return -EINVAL;
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))	/* coarse check that the user buffer is a valid writable range */
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);	/* verify the region, e.g. against mandatory locks */
	if (!ret) {
		ret = security_file_permission (file, MAY_READ);	/* security-module hook: is reading permitted? */
		if (!ret) {
			if (file->f_op->read)
				ret = file->f_op->read(file, buf, count, pos);	/* dispatch to the filesystem's read function */
			else
				ret = do_sync_read(file, buf, count, pos);	/* fall back: synchronous wrapper over ->aio_read */
			if (ret > 0) {
				dnotify_parent(file->f_dentry, DN_ACCESS);	/* notify the parent directory of the access */
				current->rchar += ret;
			}
			current->syscr++;	/* per-process I/O statistics */
		}
	}

	return ret;
}
EXPORT_SYMBOL(vfs_read);
我们可以看到,vfs_read函数只是检查了一些状态,就使用回调函数 file->f_op->read,交由相应文件系统的read函数继续进行操作,这个file_operations在open文件的时候就已经填好了。我们可以在/linux-2.6.11.10/fs/ext2/file.c里找到ext2所有的文件操作,如下。其实在新内核里,read和write之类的操作已经改了。
/* linux-2.6.11.10/fs/ext2/file.c */
/*
 * File operations for ext2 regular files.  Note that read/write are not
 * ext2-specific: they are the generic page-cache based helpers from
 * mm/filemap.c.  (Newer kernels replaced several of these hooks.)
 */
struct file_operations ext2_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_file_read,	/* generic page-cache read */
	.write		= generic_file_write,
	.aio_read	= generic_file_aio_read,
	.aio_write	= generic_file_aio_write,
	.ioctl		= ext2_ioctl,
	.mmap		= generic_file_mmap,
	.open		= generic_file_open,
	.release	= ext2_release_file,
	.fsync		= ext2_sync_file,
	.readv		= generic_file_readv,
	.writev		= generic_file_writev,
	.sendfile	= generic_file_sendfile,
};
可以看到,ext2的read操作并没有额外定义,而是使用了一个通用文件读函数,在/linux-2.6.11.10/mm/filemap.c文件里可以找到这个函数,因为读写是基于页操作的。
/* linux-2.6.11.10/mm/filemap.c */
/*
 * generic_file_read - synchronous front end: wrap the user buffer in a
 * single iovec and a synchronous kiocb, then reuse the async read path.
 */
ssize_t generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
	struct iovec local_iov = { .iov_base = buf, .iov_len = count };	/* describes the user buffer and requested length */
	struct kiocb kiocb;	/* kernel I/O control block: descriptor for sync and async I/O */
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);	/* mark this operation as synchronous */
	ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);	/* common routine used by all filesystems */
	if (-EIOCBQUEUED == ret)	/* the request was queued... */
		ret = wait_on_sync_kiocb(&kiocb);	/* ...so block until it completes */
	return ret;
}
EXPORT_SYMBOL(generic_file_read);
这个函数继续调用了一个通用例程,即__generic_file_aio_read,字面理解就是异步I/O读,它不是立即读取,而是会先在一个链表里排队,如果在排队就需要继续等。
/* linux-2.6.11.10/mm/filemap.c */
/*
 * __generic_file_aio_read - common read routine for all filesystems.
 * Validates the iovec array, then either goes direct-to-disk (O_DIRECT)
 * or reads each segment through the page cache.
 */
ssize_t
__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
{
	struct file *filp = iocb->ki_filp;	/* file object this read operates on */
	ssize_t retval;
	unsigned long seg;
	size_t count;

	count = 0;
	for (seg = 0; seg < nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		count += iv->iov_len;
		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))	/* is the user buffer described by this iovec valid? */
			continue;
		if (seg == 0)
			return -EFAULT;
		nr_segs = seg;
		count -= iv->iov_len;	/* This segment is no good */
		break;
	}

	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
	if (filp->f_flags & O_DIRECT) {	/* direct I/O: bypass the page cache */
		loff_t pos = *ppos, size;
		struct address_space *mapping;
		struct inode *inode;

		mapping = filp->f_mapping;
		inode = mapping->host;
		retval = 0;
		if (!count)
			goto out; /* skip atime */
		size = i_size_read(inode);
		if (pos < size) {
			retval = generic_file_direct_IO(READ, iocb, iov, pos, nr_segs);
			if (retval >= 0 && !is_sync_kiocb(iocb))
				retval = -EIOCBQUEUED;
			if (retval > 0)
				*ppos = pos + retval;
		}
		file_accessed(filp);
		goto out;
	}

	retval = 0;	/* not direct I/O: read through the page cache */
	if (count) {
		for (seg = 0; seg < nr_segs; seg++) {
			read_descriptor_t desc;	/* per-segment read-state descriptor */

			desc.written = 0;
			desc.arg.buf = iov[seg].iov_base;	/* user buffer for this segment */
			desc.count = iov[seg].iov_len;	/* bytes to read into it */
			if (desc.count == 0)
				continue;
			desc.error = 0;
			do_generic_file_read(filp,ppos,&desc,file_read_actor);	/* page-cache read of this segment */
			retval += desc.written;
			if (!retval) {
				retval = desc.error;
				break;
			}
		}
	}
out:
	return retval;
}
EXPORT_SYMBOL(__generic_file_aio_read);
我们可以将上面这个函数粗略划分为三部分:检查部分、直接I/O读取和页高速缓存读取。如果设置了O_DIRECT标志,则绕过页高速缓存直接读取,调用generic_file_direct_IO();否则要使用页高速缓存,调用do_generic_file_read。我们主要关注页高速缓存读取。
/* linux-2.6.11.10/include/linux/fs.h */
/*
 * do_generic_file_read - thin wrapper that turns a read of the file
 * into a read of the file's page-cache mapping (filp->f_mapping),
 * also passing the per-file readahead state (filp->f_ra).
 */
static inline void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
	do_generic_mapping_read(filp->f_mapping, &filp->f_ra, filp, ppos, desc, actor);
}
do_generic_file_read会继续调用do_generic_mapping_read,这个调用表示对文件的读操作转换为对页高速缓存的读操作。
之所以要在I/O过程中加入页高速缓存这么一个缓冲层,是为了提高读取的效率,我们希望能尽量减少对磁盘的读取,而将读取放到内存中进行,所以引入页高速缓存这么一个中间层。
上面的参数中有一个filp->f_mapping,这个是一个地址空间变量,其定义如下。
/* linux-2.6.11.10/include/linux/fs.h */
/*
 * struct address_space - ties an on-disk object (the host inode) to its
 * cached pages (page_tree): the per-file view of the page cache.
 */
struct address_space {
	struct inode		*host;		/* owner: inode, block_device */
	struct radix_tree_root	page_tree;	/* radix tree of all pages */
	spinlock_t		tree_lock;	/* and spinlock protecting it */
	unsigned int		i_mmap_writable;/* count VM_SHARED mappings */
	struct prio_tree_root	i_mmap;		/* tree of private and shared mappings */
	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
	spinlock_t		i_mmap_lock;	/* protect tree, count, list */
	unsigned int		truncate_count;	/* Cover race condition with truncate */
	unsigned long		nrpages;	/* number of total pages */
	pgoff_t			writeback_index;/* writeback starts here */
	struct address_space_operations *a_ops;	/* methods */
	unsigned long		flags;		/* error bits/gfp mask */
	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
	spinlock_t		private_lock;	/* for use by the address_space */
	struct list_head	private_list;	/* ditto */
	struct address_space	*assoc_mapping;	/* ditto */
} __attribute__((aligned(sizeof(long))));
通过host和page_tree两个属性,一个address_space结构体可以将一个文件和属于它的缓存页联系起来。page_tree是struct radix_tree_root类型的,就是一棵树的根,它指向一棵基树(radix tree),相应的页都挂在叶子节点上,这样按文件内偏移查找缓存页就很高效了。
在do_generic_mapping_read里,检查完基础数据后,会建立一个循环,这个循环每次读一页内容,直到读完所有内容。
首先是find_page,它会在关联页的基树里查找相应的页:如果没找到,就跳到no_cached_page,重新分配一个页插入到基树里去;如果页存在但内容不是最新的(PageUptodate为假——注意这并非"脏页",而是页中数据尚未从磁盘读入或已失效),则需要先读盘更新;如果既能找到,数据又是最新的,那么直接跳到page_ok将数据拷贝到用户态即可。
/* linux-2.6.11.10/mm/filemap.c — fragment of do_generic_mapping_read */
find_page:
	page = find_get_page(mapping, index);	/* look the page up in the page cache (radix tree) */
	if (unlikely(page == NULL)) {
		/* cache miss: record it for the readahead logic, then allocate */
		handle_ra_miss(mapping, &ra, index);
		goto no_cached_page;
	}
	if (!PageUptodate(page))	/* page exists but its data is not valid yet (NOT "dirty") — must read from disk */
		goto page_not_up_to_date;
找到页以后开始读页,主要的重点语句是这一句
/* linux-2.6.11.10/mm/filemap.c — fragment of do_generic_mapping_read */
readpage:
	/* Start the actual read. The read will unlock the page. */
	error = mapping->a_ops->readpage(filp, page);	/* filesystem callback, e.g. ext2_readpage */
	if (unlikely(error))
		goto readpage_error;
	if (!PageUptodate(page)) {
		lock_page(page);
		if (!PageUptodate(page)) {
			if (page->mapping == NULL) {
				/*
				 * invalidate_inode_pages got it
				 */
				unlock_page(page);
				page_cache_release(page);
				goto find_page;
			}
			unlock_page(page);
			error = -EIO;
			goto readpage_error;
		}
		unlock_page(page);
	}

	/*
	 * i_size must be checked after we have done ->readpage.
	 *
	 * Checking i_size after the readpage allows us to calculate
	 * the correct value for "nr", which means the zero-filled
	 * part of the page is not copied back to userspace (unless
	 * another truncate extends the file - this is desired though).
	 */
	isize = i_size_read(inode);
	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
	if (unlikely(!isize || index > end_index)) {
		page_cache_release(page);
		goto out;
	}

	/* nr is the maximum number of bytes to copy from this page */
	nr = PAGE_CACHE_SIZE;
	if (index == end_index) {
		/* last page of the file: clamp nr to the bytes inside i_size */
		nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
		if (nr <= offset) {
			page_cache_release(page);
			goto out;
		}
	}
	nr = nr - offset;
	goto page_ok;
这里又出现了一个回调函数,又调用了相关文件系统的相关函数,我们可以找到ext2的a_ops定义如下:
/* linux-2.6.11.10/fs/ext2/inode.c */
/* Page-cache (address_space) operations for ext2 files. */
struct address_space_operations ext2_aops = {
	.readpage		= ext2_readpage,	/* read one page in from disk */
	.readpages		= ext2_readpages,
	.writepage		= ext2_writepage,
	.sync_page		= block_sync_page,
	.prepare_write		= ext2_prepare_write,
	.commit_write		= generic_commit_write,
	.bmap			= ext2_bmap,
	.direct_IO		= ext2_direct_IO,
	.writepages		= ext2_writepages,
};
再找到ext2_readpage开始我们的读页操作。
/* linux-2.6.11.10/fs/ext2/inode.c */
/*
 * ext2_readpage - ext2's ->readpage implementation: delegate to the
 * generic mpage helper, supplying ext2_get_block so it can map file
 * blocks to on-disk blocks.
 */
static int ext2_readpage(struct file *file, struct page *page)
{
	return mpage_readpage(page, ext2_get_block);
}
这里它又继续调用了一个通用例程mpage_readpage,传入了页描述符以及ext2的数据块寻址函数ext2_get_block。
/* linux-2.6.11.10/fs/mpage.c */
/*
 * mpage_readpage - read one page: build a bio describing the disk
 * sectors backing the page, then submit it to the block layer.
 */
int mpage_readpage(struct page *page, get_block_t get_block)
{
	struct bio *bio = NULL;
	sector_t last_block_in_bio = 0;

	bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, get_block);	/* construct the bio for this page */
	if (bio)
		mpage_bio_submit(READ, bio);	/* hand the request to the block layer */
	return 0;
}
EXPORT_SYMBOL(mpage_readpage);
这里就两步操作:构造一个struct bio对象,然后提交这个任务。bio是通用块层用来管理传输数据的结构,它把一段磁盘存储区和一块内存区域联系起来。
然后提交这个任务,这里面其实还有一个调度过程,所有的bio请求都在一个队列里,它可以重排读写数据块的请求,在重复访问文件同一个部分或多进程访问同一数据,可以大大提高读取效率。
最终,这件读操作会交给磁盘的设备驱动程序来进行真正的数据操作。
读完以后,再回到do_generic_mapping_read,跳到page_ok,它会通过actor回调(即file_read_actor)调用__copy_to_user()函数,将数据拷贝到用户态缓冲区。
/* linux-2.6.11.10/mm/filemap.c — fragment of do_generic_mapping_read */
page_ok:
	/* If users can be writing to this page using arbitrary
	 * virtual addresses, take care about potential aliasing
	 * before reading the page on the kernel side.
	 */
	if (mapping_writably_mapped(mapping))
		flush_dcache_page(page);

	/*
	 * When (part of) the same page is read multiple times
	 * in succession, only mark it as accessed the first time.
	 */
	if (prev_index != index)
		mark_page_accessed(page);
	prev_index = index;

	/*
	 * Ok, we have the page, and it's up-to-date, so
	 * now we can copy it to user space...
	 *
	 * The actor routine returns how many bytes were actually used..
	 * NOTE! This may not be the same as how much of a user buffer
	 * we filled up (we may be padding etc), so we can only update
	 * "pos" here (the actor routine has to update the user buffer
	 * pointers and the remaining count).
	 */
	ret = actor(desc, page, offset, nr);	/* here actor is file_read_actor: copy to user space */
	offset += ret;
	index += offset >> PAGE_CACHE_SHIFT;	/* advance to the next page if we crossed a page boundary */
	offset &= ~PAGE_CACHE_MASK;

	page_cache_release(page);
	if (ret == nr && desc->count)
		continue;	/* more to read: loop for the next page */
	goto out;
/* linux-2.6.11.10/mm/filemap.c */
/*
 * file_read_actor - copy `size` bytes from a page-cache page into the
 * user buffer tracked by `desc`, updating the descriptor's progress
 * fields.  Returns the number of bytes actually copied.
 */
int file_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset, unsigned long size)
{
	char *kaddr;
	unsigned long left, count = desc->count;

	if (size > count)
		size = count;	/* never copy more than the caller still wants */

	/*
	 * Faults on the destination of a read are common, so do it before
	 * taking the kmap.
	 */
	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
		kaddr = kmap_atomic(page, KM_USER0);	/* fast, non-sleeping kernel mapping of the page */
		left = __copy_to_user_inatomic(desc->arg.buf, kaddr + offset, size);
		kunmap_atomic(kaddr, KM_USER0);
		if (left == 0)
			goto success;
	}

	/* Do it the slow way */
	kaddr = kmap(page);
	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
	kunmap(page);

	if (left) {
		size -= left;	/* partial copy: report only what succeeded */
		desc->error = -EFAULT;
	}
success:
	desc->count = count - size;	/* bytes the caller still wants */
	desc->written += size;		/* bytes delivered so far */
	desc->arg.buf += size;		/* advance the user buffer pointer */
	return size;
}
然后更新一些计数,再一步步往上返回到最开始的read()系统调用,调用就结束了。