File system

这个实验是关于文件系统的，要实现大文件存储和符号链接功能。

我们首先来看一下 xv6 文件系统的结构。

xv6 文件系统分层结构如下图所示，Disk 层负责读写硬件设备，Buffer cache 层缓存磁盘块，负责同步缓存的更改到磁盘，确保在某一时刻只有一个进程对特定的磁盘块更改。Logging 层确保上层能对磁盘块进行事务操作，例如对几个磁盘块的更改要么都成功要么都失败。Inode 层提供了一个个独立的文件，每个文件都被表示为一个 inode, inode 拥有唯一编号以及存储文件内容的数据块。Pathname 层提供了对文件按层级递归访问的能力，File descriptor 层将 unix 资源如：管道、设备、文件等抽象为文件系统接口。

文件系统必须规划好磁盘哪里存 inodes 哪里存数据块，因此 xv6 将磁盘分成了下图所示的部分，block 0 为启动块，文件系统不会使用 block 0 ，block 1 称为超级块，它包含了文件系统的元信息，例如：文件系统中每个块的大小，有多少个块，inodes 的个数，log 有多少个块。从 block 2 开始存储 log，log 后面是存储 inodes，一个块里会存储多个 inodes，接下来是 bitmap 记录了哪些数据块是已经被使用了的，最后是数据块，要么是 bitmap 标记为空闲的数据块，或者是存储了文件和目录的内容的数据块。

Large files

在这个练习中，将增加 xv6 文件的最大大小。目前 xv6 文件限制为 268 个块，或 268*BSIZE 字节（在 xv6 中 BSIZE 为 1024）。这个限制来自这样一个事实，一个 xv6 inode 包含 12 个直接块号和一个间接块号，一个块最多可以容纳 256 个块号，总共 12+256=268 块。

我们要做的就是实现两级间接块号来使一个 inode 能存储更大的文件。

实现：

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122


#define NDIRECT 11
#define ONEINDIRECT (BSIZE / sizeof(uint))
#define NINDIRECT (ONEINDIRECT * ONEINDIRECT + ONEINDIRECT)
#define MAXFILE (NDIRECT + NINDIRECT)

// in-memory copy of an inode
struct inode {
  uint dev;           // Device number
  uint inum;          // Inode number
  int ref;            // Reference count
  struct sleeplock lock; // protects everything below here
  int valid;          // inode has been read from disk?

  short type;         // copy of disk inode
  short major;
  short minor;
  short nlink;
  uint size;
  uint addrs[NDIRECT+2]; // 增加一个间接块号的地址
};

static uint
bmap(struct inode *ip, uint bn)
{
  uint addr, *a;
  struct buf *bp;

  if(bn < NDIRECT){
    if((addr = ip->addrs[bn]) == 0)
      ip->addrs[bn] = addr = balloc(ip->dev);
    return addr;
  }
  bn -= NDIRECT;

  if(bn < NINDIRECT){
    if(bn < ONEINDIRECT){
      // Load indirect block, allocating if necessary.
      if((addr = ip->addrs[NDIRECT]) == 0)
        ip->addrs[NDIRECT] = addr = balloc(ip->dev);
      bp = bread(ip->dev, addr);
      a = (uint*)bp->data;
      if((addr = a[bn]) == 0){
        a[bn] = addr = balloc(ip->dev);
        log_write(bp);
      }
      brelse(bp);
      return addr;
    } else {
      bn -= ONEINDIRECT;
      if((addr = ip->addrs[NDIRECT+1]) == 0)
        ip->addrs[NDIRECT+1] = addr = balloc(ip->dev);
      bp = bread(ip->dev, addr);
      a = (uint*)bp->data;
      if((addr = a[bn/ONEINDIRECT]) == 0) {
        a[bn/ONEINDIRECT] = addr = balloc(ip->dev);
        log_write(bp);
      }
      brelse(bp);
      bp = bread(ip->dev, addr);
      a = (uint*)bp->data;
      if((addr = a[bn%ONEINDIRECT]) == 0) {
        a[bn%ONEINDIRECT] = addr = balloc(ip->dev);
        log_write(bp);
      }
      brelse(bp);
      return addr;
    }
  }

  panic("bmap: out of range");
}

void
itrunc(struct inode *ip)
{
  int i, j, k;
  struct buf *bp, *ibp;
  uint *a, *b;

  for(i = 0; i < NDIRECT; i++){
    if(ip->addrs[i]){
      bfree(ip->dev, ip->addrs[i]);
      ip->addrs[i] = 0;
    }
  }

  if(ip->addrs[NDIRECT]){
    bp = bread(ip->dev, ip->addrs[NDIRECT]);
    a = (uint*)bp->data;
    for(j = 0; j < ONEINDIRECT; j++){
      if(a[j])
        bfree(ip->dev, a[j]);
    }
    brelse(bp);
    bfree(ip->dev, ip->addrs[NDIRECT]);
    ip->addrs[NDIRECT] = 0;
  }

  if(ip->addrs[NDIRECT+1]){
    bp = bread(ip->dev, ip->addrs[NDIRECT+1]);
    a = (uint*)bp->data;
    for(j = 0; j < ONEINDIRECT; j++){
      if(a[j]) {
        ibp = bread(ip->dev, a[j]);
        b = (uint*)ibp->data;
        for(k = 0; k < ONEINDIRECT; k++) {
          if(b[k]){
            bfree(ip->dev, b[k]);
          }
        }
        brelse(ibp);
        bfree(ip->dev, a[j]);
      }
    }
    brelse(bp);
    bfree(ip->dev, ip->addrs[NDIRECT+1]);
    ip->addrs[NDIRECT+1] = 0;
  }

  ip->size = 0;
  iupdate(ip);
}

Symbolic links

实现符号链接。符号链接（或软链接）是指通过路径名链接的文件；当一个符号链接被打开时，内核会跟随链接指向被引用的文件。符号链接类似于硬链接，但硬链接仅限于指向同一磁盘上的文件，而符号链接可以跨磁盘设备。

实现，基本上按照提示走就可以了。

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140


uint64
sys_symlink(void) {
  char target[MAXPATH], path[MAXPATH];
  struct inode *dp, *ip;
  if(argstr(0, target, MAXPATH) < 0 || argstr(1, path, MAXPATH) < 0)
    return -1;

  begin_op();
  if((ip = namei(target)) != 0){
    ilock(ip);
    if(ip->type == T_DIR){
      iunlockput(ip);
      end_op();
      return -1;
    }
    iunlockput(ip);
  }

  if((dp = create(path, T_SYMLINK, 0, 0)) == 0){
    end_op();
    return -1;
  }

  uint len = strlen(target) + 1;
  if(writei(dp, 0, (uint64)target, 0, len) != len){
    iunlockput(dp);
    end_op();
    return -1;
  }

  iupdate(dp);
  iunlockput(dp);
  end_op();

  return 0;
}

struct inode*
find_symlink(char *path, char *rpath, int depth)
{
  if(depth>= 10)
    return 0;

  struct inode *ip;
  if((ip = namei(path)) != 0){
    ilock(ip);

    if(ip->type != T_SYMLINK){
      iunlock(ip);
      return ip;
    }

    if(readi(ip, 0, (uint64)rpath, 0, ip->size) == 0){
      iunlockput(ip);
      return 0;
    }

    iunlockput(ip);

    return find_symlink(rpath, rpath, depth + 1);
  }

  return 0;
}

uint64
sys_open(void)
{
  char path[MAXPATH];
  int fd, omode;
  struct file *f;
  struct inode *ip;
  int n;

  if((n = argstr(0, path, MAXPATH)) < 0 || argint(1, &omode) < 0)
    return -1;

  begin_op();

  if(omode & O_CREATE){
    ip = create(path, T_FILE, 0, 0);
    if(ip == 0){
      end_op();
      return -1;
    }
  } else {
     if(omode & O_NOFOLLOW){
      if((ip = namei(path)) == 0){
        end_op();
        return -1;
      }
    } else{
      char rpath[MAXPATH];
      if((ip = find_symlink(path, rpath, 0)) == 0){
        end_op();
        return -1;
      }
    }
    ilock(ip);
    if(ip->type == T_DIR && omode != O_RDONLY){
      iunlockput(ip);
      end_op();
      return -1;
    }
  }

  if(ip->type == T_DEVICE && (ip->major < 0 || ip->major >= NDEV)){
    iunlockput(ip);
    end_op();
    return -1;
  }

  if((f = filealloc()) == 0 || (fd = fdalloc(f)) < 0){
    if(f)
      fileclose(f);
    iunlockput(ip);
    end_op();
    return -1;
  }

  if(ip->type == T_DEVICE){
    f->type = FD_DEVICE;
    f->major = ip->major;
  } else {
    f->type = FD_INODE;
    f->off = 0;
  }
  f->ip = ip;
  f->readable = !(omode & O_WRONLY);
  f->writable = (omode & O_WRONLY) || (omode & O_RDWR);

  if((omode & O_TRUNC) && ip->type == T_FILE){
    itrunc(ip);
  }

  iunlock(ip);
  end_op();

  return fd;
}

总结

这个实验让我熟悉了 xv6 文件系统的结构，以及文件存储，访问在内核中是如何进行的。实验难度来讲是比较容易的，主要是加深对文件系统的理解。这里还有一个插曲是跑测试的过程中发现了 docker volume 的性能很差，因为我用的是将代码放在本地编写修改，通过 docker volume 的方式映射到一个 ubuntu 的容器内跑测试，结果发现测试一直跑不过，都是因为超时退出，但是在一个一个测试跑又都能跑过，就开始怀疑是不是 docker 环境的问题，结果网上一查确实有这个问题，就写了测试代码试了一下，发现确实是环境的问题。无奈又在 mac 上装了一套环境，顺利跑过测试。

Contents

Large files

Symbolic links

总结