Linux文件系统之文件的读写.doc
Linux文件系统之文件的读写(续二) - 本文系本站原创,欢迎转载!转载请注明出处:-

八:VFS层的I/O操作
VFS层是与用户界面直接交互的接口,在这一节里,我们将分为读写两部份来介绍VFS层的操作以及跟上层应用的交互.

8.1:文件的读操作
在用户空间,读文件操作的常用函数为read()。对应在系统空间的调用入口是sys_read().它的代码如下:

asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
	struct file *file;
	ssize_t ret = -EBADF;
	int fput_needed;

	//根据fd从进程中取出相应的file对象
	file = fget_light(fd, &fput_needed);
	if (file) {
		loff_t pos = file_pos_read(file);	//文件的当前位置
		ret = vfs_read(file, buf, count, &pos);
		//更新当前的文件位置
		file_pos_write(file, pos);
		fput_light(file, fput_needed);
	}

	return ret;
}

从进程中取得文件描述符和文件当前的操作位置后,会调用vfs_read()执行具体的操作过程.它的代码如下:

ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	struct inode *inode = file->f_dentry->d_inode;
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
		return -EINVAL;

	//检查当前区段是否允许读操作
	ret = locks_verify_area(FLOCK_VERIFY_READ, inode, file, *pos, count);
	if (!ret) {
		//是否有权限
		ret = security_file_permission (file, MAY_READ);
		if (!ret) {
			//如果有read 操作,调用之
			if (file->f_op->read)
				ret = file->f_op->read(file, buf, count, pos);
			else
				//否则调用aio_read
				ret = do_sync_read(file, buf, count, pos);
			//ret: 读到的字节数(即写入用户缓冲区的字节数)
			if (ret > 0)
				//产生通告
				dnotify_parent(file->f_dentry, DN_ACCESS);
		}
	}

	return ret;
}

从上面看到,会最终调用file的相关操作完成文件的读操作.曾记得我们在文件的打开一节中分析了文件的打开过程。在打开文件过程中,文件描述符的相关操作会被赋值为inode->i_fop.对于ext2文件系统,inode的相关信息如下:

inode->i_fop = &ext2_file_operations;

struct file_operations ext2_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_file_read,
	.write		= generic_file_write,
	.aio_read	= generic_file_aio_read,
	.aio_write	= generic_file_aio_write,
	.ioctl		= ext2_ioctl,
	.mmap		= generic_file_mmap,
	.open		= generic_file_open,
	.release	= ext2_release_file,
	.fsync		= ext2_sync_file,
	.readv		= generic_file_readv,
	.writev		= generic_file_writev,
	.sendfile	= generic_file_sendfile,
};

相应文件读操作入口为generic_file_read():

ssize_t
generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
	//用户空间的地址和长度
	struct iovec local_iov = {
		.iov_base = buf, .iov_len = count };
	//记录完成状态
	struct kiocb kiocb;
	ssize_t ret;

	//kiocb.ki_key=KIOCB_SYNC_KEY; kiocb.ki_filp=filp;kiocb.ki_obj=current;
	init_sync_kiocb(&kiocb, filp);
	//返回读写完成的字节数
	ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
	//异步操作,需要等待
	if (-EIOCBQUEUED == ret)
		ret = wait_on_sync_kiocb(&kiocb);
	//返回完成的字节数
	return ret;
}

__generic_file_aio_read()是一个很重要的函数,它是读操作的入口。代码如下:

ssize_t
__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
		unsigned long nr_segs, loff_t *ppos)
{
	struct file *filp = iocb->ki_filp;
	ssize_t retval;
	unsigned long seg;
	size_t count;

	count = 0;
	for (seg = 0; seg < nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		count += iv->iov_len;
		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
			return -EINVAL;
		//检查从 iv->iov_base 开始的iov_len区间的合法性
		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		//nr_segs: 有效的数据段数目
		nr_segs = seg;
		//这个数据段无效,将其长度减下来
		count -= iv->iov_len;	/* This segment is no good */
		break;
	}

	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
	//如果定义了O_DIRECT:直接传送数据绕过了页高速缓存
	if (filp->f_flags & O_DIRECT) {
		loff_t pos = *ppos, size;
		struct address_space *mapping;
		struct inode *inode;

		mapping = filp->f_mapping;
		inode = mapping->host;
		retval = 0;
		if (!count)
			goto out; /* skip atime */
		size = i_size_read(inode);
		if (pos < size) {
			retval = generic_file_direct_IO(READ, iocb,
						iov, pos, nr_segs);
			if (retval >= 0 && !is_sync_kiocb(iocb))
				retval = -EIOCBQUEUED;
			if (retval > 0)
				*ppos = pos + retval;
		}
		file_accessed(filp);
		goto out;
	}

	//count:读取文件的长度
	retval = 0;
	if (count) {
		for (seg = 0; seg < nr_segs; seg++) {
			//read_descriptor_t:读操作描述符,用来记录读的状态
			read_descriptor_t desc;

			desc.written = 0;
			desc.arg.buf = iov[seg].iov_base;
			desc.count = iov[seg].iov_len;
			//如果没有要传输的数据,继续下一个iov
			if (desc.count == 0)
				continue;
			desc.error = 0;
			//对其中的每一个段调用do_generic_file_read
			do_generic_file_read(filp,ppos,&desc,file_read_actor,0);
			//desc.written:写入到用户空间的字节数
			//更新retval
			retval += desc.written;
			if (!retval) {
				retval = desc.error;
				break;
			}
		}
	}
out:
	return retval;
}

这里有种特殊情况,当文件是用直接I/O模式打开时(文件描述符带有O_DIRECT标志),就会采用直接I/O而跳过了页高速缓存区。这样的情况我们在之后再讨论.对于普通模式的情况,将会对每一个段调用do_generic_file_read()来完成I/O操作。这个函数的代码如下:

do_generic_file_read() → do_generic_mapping_read():

/*
	mapping:  页高速缓存区
	_ra:      filp对应的file_ra_state
	filp:     打开的文件描述符
	ppos:     当前的操作位置
	desc:     读操作描述符
	actor:    内核空间到用户空间的拷贝函数
	nonblock: 如果此变量为0,则进行预读;为1时不预读,
	          并且所需页面不在页缓存区中或者内容无效时,直接以-EWOULDBLOCKIO返回
*/
void do_generic_mapping_read(struct address_space *mapping,
			     struct file_ra_state *_ra,
			     struct file *filp,
			     loff_t *ppos,
			     read_descriptor_t *desc,
			     read_actor_t actor,
			     int nonblock)
{
	struct inode *inode = mapping->host;
	unsigned long index, end_index, offset;
	loff_t isize;
	struct page *cached_page;
	int error;
	struct file_ra_state ra = *_ra;

	cached_page = NULL;
	//找到页面的偏移量(页序号)。即确定是存储在哪个页面中
	index = *ppos >> PAGE_CACHE_SHIFT;
	//第一个请求字节在页面的偏移量
	//亦即请求的字节在页面中的偏移
	offset = *ppos & ~PAGE_CACHE_MASK;

	//inode对应的文件大小
	isize = i_size_read(inode);
	if (!isize)
		goto out;

	//最后的缓存页序号
	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
	for (;;) {
		struct page *page;
		unsigned long nr, ret;

		/* nr is the maximum number of bytes to copy from this page */
		//nr: 缓存页空间大小
		nr = PAGE_CACHE_SIZE;
		if (index >= end_index) {
			//index > end_index: 超出了文件的最后一页,肯定是非法的
			if (index > end_index)
				goto out;
			//执行到这里,肯定有index == end_index
			//nr转化成了文件在最后一个缓存page中的位置
			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			//offset是当前位置在页中的偏移,nr: 是文件末尾在最后一页中的位置(最后一页中的有效字节数)
			//如果nr<=offset说明文件已经操作完了
			if (nr <= offset) {
				goto out;
			}
		}
		//nr-offset: 页面的剩余操作字节数
		nr = nr - offset;

		//检查当前进程是否设置了重新调度标志,如果有,调用schedule()重新调度一次
		cond_resched();
		//文件预读
		if (!nonblock)
			page_cache_readahead(mapping, &ra, filp, index);

find_page:
		//寻找当前位置对应的缓存页
		page = find_get_page(mapping, index);
		if (unlikely(page == NULL)) {
			//没有找到对应的缓存页,说明在页缓存区中不存在此页面对应的缓存页
			if (nonblock) {
				desc->error = -EWOULDBLOCKIO;
				break;
			}
			handle_ra_miss(mapping, &ra, index);
			goto no_cached_page;
		}
		//在页缓存区中找到了相关的页面
		//检查PG_uptodate标志是否被设置,如果这个标志被设置的话,就不需要从设备
		//上去读取了
		if
		(!PageUptodate(page)) {
			//页面没有设置PG_uptodate,页面中的内容无效,所以要从文件系统中把数据读取出来
			if (nonblock) {
				page_cache_release(page);
				desc->error = -EWOULDBLOCKIO;
				break;
			}
			goto page_not_up_to_date;
		}
page_ok:
		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * Mark the page accessed if we read the beginning.
		 */
		if (!offset)
			mark_page_accessed(page);

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		//页面与用户空间的值拷贝.返回拷贝的数据数
		ret = actor(desc, page, offset, nr);
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;

		page_cache_release(page);
		//如果ret == nr: 拷贝的长度等于在页面中的剩余长度,说明拷贝没有发生错误
		if (ret == nr && desc->count)
			continue;
		//否则,可以退出了
		goto out;

page_not_up_to_date:
		/* Get exclusive access to the page ... */
		//要从文件系统中传数据到此页面上。将此页面锁定
		lock_page(page);

		/* Did it get unhashed before we got the lock? */
		//有可能在锁页面的时候有其它的进程将页面移除了页缓存区
		//在这种情况下:将page解锁并减少它的使用计数,重新循环
		//重新进入循环后,在页缓存区找不到对应的page.就会重新分配一个新的page
		if (!page->mapping) {
			unlock_page(page);
			page_cache_release(page);
			continue;
		}

		/* Did somebody else fill it already? */
		//在加锁的时候,有其它的进程完成了从文件系统到具体页面的映射?
		//在这种情况下,返回到page_ok.直接将页面上的内容copy到用户空间即可
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}

		//读取页面
readpage:
		/* Start the actual read. The read will unlock the page. */
		//到这里的话,实际的读取过程开始了
		error = mapping->a_ops->readpage(filp, page);

		//读取错误,退出
		if (unlikely(error))
			goto readpage_error;

		//如果PG_uptodate标志仍然未设置.就一直等待,一直到page不处于锁定状态
		// TODO: 在将文件系统的内容读入page之前,page一直是处于Lock状态的。一直到
		//读取完成后,才会将页面解锁.
		//然后将进程唤醒
		if (!PageUptodate(page)) {
			wait_on_page_locked(page);
			//如果页面仍然没有PG_uptodate标志.只可能是发生了错误.出错返回
			if (!PageUptodate(page)) {
				error = -EIO;
				goto readpage_error;
			}
		}

		/*
		 * i_size must be checked after we have done ->readpage.
		 *
		 * Checking i_size after the readpage allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */
		isize = i_size_read(inode);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		//如果文件大小为0或者当前位置超过了文件大小
		if (unlikely(!isize || index > end_index)) {
			page_cache_release(page);
			goto out;
		}

		/* nr is the maximum number of bytes to copy from this page */
		//重新计算nr 即在页面中剩余的要copy的字节数
		nr = PAGE_CACHE_SIZE;
		if (index == end_index) {
			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (nr <= offset) {
				page_cache_release(page);
				goto out;
			}
		}
		nr = nr - offset;
		goto page_ok;

readpage_error:
		/* UHHUH! A synchronous read error occurred. Report it */
		desc->error = error;
		page_cache_release(page);
		goto out;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page.
		 */
		//在页缓存区中没有相关的缓存页
		//新分配一个页面
		if (!cached_page) {
			cached_page = page_cache_alloc_cold(mapping);
			if (!cached_page) {
				desc->error = -ENOMEM;
				goto out;
			}
		}
		//将分得的页加到页缓存区和LRU
		// TODO:在将新页面插入页缓存区时,会将页面标志设置为PG_locked
		error = add_to_page_cache_lru(cached_page, mapping,
						index, GFP_KERNEL);
		if (error) {
			if (error == -EEXIST)
				goto find_page;
			desc->error = error;
			goto out;
		}
		page = cached_page;
		cached_page = NULL;
		goto readpage;
	}

out:
	*_ra = ra;

	//ppos: 最后的读取位置
	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
	if (cached_page)
		page_cache_release(cached_page);
	if (filp)
		file_accessed(filp);
}

从代码中可以看到,只有参数nonblock为0时才会调用page_cache_readahead()进行预读;nonblock为1时不会预读,而且当页面不在页高速缓存中或者内容不是最新时,直接以-EWOULDBLOCKIO返回。在这里的调用中nonblock为零,因此会进行预读,不过我们暂且不考虑预读的情况。关于预读的操作,我们之后再给出分析.

在这个操作中,有这样几种可能的情况:
1:如果要访问的页面在页高速缓存中,而且已经被更新(含有PG_uptodate标志).只需要直接将其copy到用户空间即可.
2:序号对应的页面不在高速缓存中,那就需要在页高速缓存中增加序号对应的页面。然后从文件系统中读取数据到这个页面上.再拷贝到用户空间。
3:序号对应的页面在高速缓存中,但数据不是最新的.这就需要缓存页与文件系统进行同步.再将页面拷贝到用户空间.

对于2和3,它们有一部份是相同的,即从文件系统中读数据的过程。我们只需要分析第2种情况。对应的代码片段如下:

void do_generic_mapping_read(struct address_space *mapping,
			     struct file_ra_state *_ra,
			     struct file *filp,
			     loff_t *ppos,
			     read_descriptor_t *desc,
			     read_actor_t actor,
			     int nonblock)
{
	……
		page = find_get_page(mapping, index);
		if (unlikely(page == NULL)) {
			//没有找到对应的缓存页,说明在页缓存区中不存在此页面对应的缓存页
			if (nonblock) {
				desc->error = -EWOULDBLOCKIO;