System calls

这是 6.s081 的第二个实验，使用系统调用来写一些工具，从而帮助你更好的了解系统调用是如何工作的。

RISC-V 调用规约（calling convention)

为了更好的理解系统调用的过程，这里需要了解一下 risc-v 的调用规约，即 risc-v 在进行函数调用时，调用者和被调用者需要遵循的一种约定，首先来看一下调用过程中寄存器的使用。

寄存器名	ABI 名（编程用名）	用途约定	谁负责在函数调用过程中维护这些寄存器
x0	zero	读取时总为 0，写入时无效	N/A
x1	ra	存放函数返回的地址（return address）	Caller
x2	sp	存放栈指针（stack pointer）	Callee
x5-x7,x28-x31	t0-t2,t3-t6	临时（temporaries) 寄存器，Callee 可能会使用这些寄存器，所以 Callee 不保证这些寄存器中的值在函数调用过程中保存不变，这意味着对于 Caller 来说，如果需要的话，Caller 需要自己在调用 Callee 之前保存临时寄存器中的值。	Caller
x8,x9, x18-x27	s0,s1,s2-s11	保存（saved）寄存器，Callee 需要保证这些寄存器的值在函数返回后仍然维持函数调用之前的原值，所以一旦 Callee 在自己的函数中会用到这些寄存器则需要在栈中备份并在退出函数时进行恢复。	Callee
x10, x11	a0,a1	参数（argument）寄存器，用于函数调用过程中保存第一个和第二个参数，以及在函数返回时传递返回值。	Caller
x12-x17	a2-a7	参数（argument）寄存器，如果函数调用时需要传递更多的参数，则可以用这些寄存器，但注意用于传递参数的寄存器最多只有 8 个（a0-a7），如果还有更多的参数则要利用栈。	Caller

接下来看一下调用过程常用的一些指令。

伪指令	等价指令	描述	例子
`jal offset`	`jal x1, offset`	跳转到 offset 指定位置，返回地址保存在 x1(ra)	`jal foo`
`jalr rs`	`jalr x1, 0(rs)`	跳转到 rs 中值指定的位置，返回地址保存在 x1(ra)	`jalr s1`
`j offset`	`jal x0, offset`	跳转到 offset 指定位置，不保存返回地址	`j loop`
`jr rs`	`jalr x0, 0(rs)`	跳转到 rs 值指定位置，不保存返回地址	`jr s1`
`call offset`	`auipc x1,offset[31:12] + offset[11];jalr x1 offset[11:0](x1)`	长跳转调用函数	`call foo`
`tail offset`	`auipc x6,offset[31:12] + offset[11];jalr x0 offset[11:0](x6)`	长跳转尾调用	`tail foo`
`ret`	`jalr x0, 0(x1)`	从 Callee 返回	`ret`

函数调用时的栈帧。

看一个具体函数调用的例子。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10


int g(int x) {
   0:	1141                	addi	sp,sp,-16 // 开辟栈空间
   2:	e422                	sd	s0,8(sp)    // 将 s0 的值写到 sp 偏移 8 字节的位置，s0 存的就是 fp
   4:	0800                	addi	s0,sp,16  // s0 = sp + 16，让 s0（fp）指向栈帧顶部，即进入函数时的 sp
  return x+3;
}
   6:	250d                	addiw	a0,a0,3   //a0 里保存了参数的值 x, 这里 a0 = x + 3
   8:	6422                	ld	s0,8(sp)    // 将 sp 偏移 8 字节的位置的值写到 s0, 这里是还原 s0
   a:	0141                	addi	sp,sp,16  // 还原 sp
   c:	8082                	ret             // 返回

这个调用例子里 ra 即 return address 并没有压进栈里，猜测是编译器认为在这个函数里不会调用其他函数，所以 ra 寄存器的值不会变，可以直接读取 ra 里的值作为返回地址。

下面这个例子里 ra 是压进栈里的。

1
2
3
4
5
6


void main(void) {
  1c:	1141                	addi	sp,sp,-16
  1e:	e406                	sd	ra,8(sp)
  20:	e022                	sd	s0,0(sp)
  22:	0800                	addi	s0,sp,16
  printf("%d %d\n", f(8)+1, 13);

系统调用流程

从启动一个进程调用 exec 系统调用开始，来了解一下整个系统调用的流程。首先 exec 是在 (user/initcode.S:7) 开始的。

1
2
3
4
5
6
7


# exec(init, argv)
# Loads the two arguments into a0 and a1 and the system call
# number into a7, then executes ecall to trap into the kernel.
.globl start
start:
        la a0, init
        la a1, argv
        li a7, SYS_exec
        ecall

a0, a1 存放了系统调用的参数，a7 存放了系统调用号，对应 (kernel/syscall.c:108) 中的定义。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23


// Dispatch table indexed by system call number: syscalls[n] is the
// handler invoked by syscall() for call number n. Slots that are not
// listed stay null and are rejected by syscall().
// Uses the standard C99 designated-initializer form ([index] = value)
// rather than the obsolete GNU form without '='.
static uint64 (*syscalls[])(void) = {
  [SYS_fork]   = sys_fork,
  [SYS_exit]   = sys_exit,
  [SYS_wait]   = sys_wait,
  [SYS_pipe]   = sys_pipe,
  [SYS_read]   = sys_read,
  [SYS_kill]   = sys_kill,
  [SYS_exec]   = sys_exec,
  [SYS_fstat]  = sys_fstat,
  [SYS_chdir]  = sys_chdir,
  [SYS_dup]    = sys_dup,
  [SYS_getpid] = sys_getpid,
  [SYS_sbrk]   = sys_sbrk,
  [SYS_sleep]  = sys_sleep,
  [SYS_uptime] = sys_uptime,
  [SYS_open]   = sys_open,
  [SYS_write]  = sys_write,
  [SYS_mknod]  = sys_mknod,
  [SYS_unlink] = sys_unlink,
  [SYS_link]   = sys_link,
  [SYS_mkdir]  = sys_mkdir,
  [SYS_close]  = sys_close,
};

ecall 指令陷入内核，先后执行 uservec、usertrap，然后执行 syscall。(kernel/trampoline.S:16)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70


uservec:
	#
        # trap.c sets stvec to point here, so
        # traps from user space start here,
        # in supervisor mode, but with a
        # user page table.
        #
        # sscratch points to where the process's p->trapframe is
        # mapped into user space, at TRAPFRAME.
        #
        # Overall sequence: save every user register into the
        # trapframe, load the kernel stack pointer, hartid, trap
        # handler address and page table from the trapframe, then
        # jump to the handler.
        #

	# swap a0 and sscratch
        # so that a0 is TRAPFRAME
        # (the user's a0 is now parked in sscratch)
        csrrw a0, sscratch, a0

        # save the user registers in TRAPFRAME
        sd ra, 40(a0)
        sd sp, 48(a0)
        sd gp, 56(a0)
        sd tp, 64(a0)
        sd t0, 72(a0)
        sd t1, 80(a0)
        sd t2, 88(a0)
        sd s0, 96(a0)
        sd s1, 104(a0)
        # offset 112 (the a0 slot) is skipped here: a0 currently holds
        # TRAPFRAME, so the user's a0 is stored further down.
        sd a1, 120(a0)
        sd a2, 128(a0)
        sd a3, 136(a0)
        sd a4, 144(a0)
        sd a5, 152(a0)
        sd a6, 160(a0)
        sd a7, 168(a0)
        sd s2, 176(a0)
        sd s3, 184(a0)
        sd s4, 192(a0)
        sd s5, 200(a0)
        sd s6, 208(a0)
        sd s7, 216(a0)
        sd s8, 224(a0)
        sd s9, 232(a0)
        sd s10, 240(a0)
        sd s11, 248(a0)
        sd t3, 256(a0)
        sd t4, 264(a0)
        sd t5, 272(a0)
        sd t6, 280(a0)

	# save the user a0 in p->trapframe->a0
        # (t0 is free to use as scratch: it was saved above)
        csrr t0, sscratch
        sd t0, 112(a0)

        # restore kernel stack pointer from p->trapframe->kernel_sp
        ld sp, 8(a0)

        # make tp hold the current hartid, from p->trapframe->kernel_hartid
        ld tp, 32(a0)

        # load the address of usertrap(), p->trapframe->kernel_trap
        ld t0, 16(a0)

        # restore kernel page table from p->trapframe->kernel_satp
        ld t1, 0(a0)
        csrw satp, t1
        sfence.vma zero, zero

        # a0 is no longer valid, since the kernel page
        # table does not specially map p->tf.

        # jump to usertrap(), which does not return
        jr t0

(kernel/trap.c:37)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53


//
// handle an interrupt, exception, or system call from user space.
// called from trampoline.S
//
// Flow: verify the trap came from user mode, redirect further traps
// to kernelvec, save the user pc, then either run syscall(), accept
// a device interrupt, or mark the process killed. Finally return to
// user space through usertrapret().
//
void
usertrap(void)
{
  // result of devintr(); stays 0 unless the device-interrupt branch runs
  int which_dev = 0;

  // the SPP bit being set means the trap did not come from user mode
  if((r_sstatus() & SSTATUS_SPP) != 0)
    panic("usertrap: not from user mode");

  // send interrupts and exceptions to kerneltrap(),
  // since we're now in the kernel.
  w_stvec((uint64)kernelvec);

  struct proc *p = myproc();

  // save user program counter.
  p->trapframe->epc = r_sepc();

  if(r_scause() == 8){
    // system call

    // do not start a system call for a process already marked killed
    if(p->killed)
      exit(-1);

    // sepc points to the ecall instruction,
    // but we want to return to the next instruction.
    p->trapframe->epc += 4;

    // an interrupt will change sstatus &c registers,
    // so don't enable until done with those registers.
    intr_on();

    syscall();
  } else if((which_dev = devintr()) != 0){
    // ok
  } else {
    // neither a system call nor a recognized device interrupt:
    // report it and mark the process to be killed
    printf("usertrap(): unexpected scause %p pid=%d\n", r_scause(), p->pid);
    printf("sepc=%p stval=%p\n", r_sepc(), r_stval());
    p->killed = 1;
  }

  // killed may have been set above or while the trap was being handled
  if(p->killed)
    exit(-1);

  // give up the CPU if this is a timer interrupt.
  if(which_dev == 2)
    yield();

  usertrapret();
}

(kernel/syscall.c:133)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15


// Run the system call requested by the current process.
// The call number is taken from the saved user a7; the handler's
// result (or -1 for an unknown number) is written to the saved a0.
void
syscall(void)
{
  struct proc *cur = myproc();
  int callno = cur->trapframe->a7;

  // Reject numbers outside the table and slots with no handler.
  if(callno <= 0 || callno >= NELEM(syscalls) || !syscalls[callno]){
    printf("%d %s: unknown sys call %d\n",
            cur->pid, cur->name, callno);
    cur->trapframe->a0 = -1;
    return;
  }

  cur->trapframe->a0 = syscalls[callno]();
}

系统调用的返回值会写到 p->trapframe->a0，当 exec() 返回时通过 a0 读到系统调用的返回值，系统调用通常用负数返回值来表示错误，0 表示成功。

上面简单梳理了系统调用的流程，梳理清楚这些就可以做这个 lab2 了，当然这里还有些细节，比如参数如何从用户态传递到内核态，内核怎么通过用户态传入的地址寻址等等这些要等后面虚拟内存和中断两个实验做完才知道。

System call tracing

这里要实现一个新的系统调用 trace 用来跟踪调用了哪个系统调用，trace 系统调用传入一个参数 mask 来指定需要跟踪哪个系统调用，trace 系统调用要打印出系统调用的名称和返回值以及进程号。

根据实验提示首先在 Makefile 里把 $U/_trace 加到 UPROGS。

然后增加 trace 系统调用，_trace 进程会调用 sys_trace 系统调用。

1
2
3
4
5
6
7
8
9


// trace(mask): record the caller's trace mask.
// Fetches integer argument 0 and stores it in the calling process's
// tmask field. Returns 0 on success, -1 if the argument fetch fails.
uint64
sys_trace(void)
{
  int requested;

  if(argint(0, &requested) >= 0){
    myproc()->tmask = requested;
    return 0;
  }
  return -1;
}

为了保存 mask 参数需要在进程结构体中加入一个变量来记录。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22


// Per-process state.
struct proc {
  struct spinlock lock;

  // p->lock must be held when using these:
  enum procstate state;        // Process state
  struct proc *parent;         // Parent process
  void *chan;                  // If non-zero, sleeping on chan
  int killed;                  // If non-zero, have been killed
  int xstate;                  // Exit status to be returned to parent's wait
  int pid;                     // Process ID
  // Trace mask: syscall() reports call number n when bit n is set.
  // NOTE(review): listed under the p->lock group, but sys_trace()
  // writes it and syscall() reads it without taking p->lock.
  int tmask;                    // Trace mask

  // these are private to the process, so p->lock need not be held.
  uint64 kstack;               // Virtual address of kernel stack
  uint64 sz;                   // Size of process memory (bytes)
  pagetable_t pagetable;       // User page table
  struct trapframe *trapframe; // data page for trampoline.S
  struct context context;      // swtch() here to run process
  struct file *ofile[NOFILE];  // Open files
  struct inode *cwd;           // Current directory
  char name[16];               // Process name (debugging)
};

同时不要忘了在 fork 子进程时将 tmask 复制到子进程。

1

// copy the parent's trace mask into the new process
np->tmask = p->tmask;

最后在系统调用的入口处判断，然后打印需要的信息就可以了。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18


// Dispatch the system call whose number is in the saved user a7 and
// store its return value in the saved user a0. If the bit for this
// call number is set in the process's trace mask, also print
// "<pid>: syscall <name> -> <return value>".
// Unknown call numbers are reported and yield -1.
void
syscall(void)
{
  int num;
  struct proc *p = myproc();

  num = p->trapframe->a7;
  if(num > 0 && num < NELEM(syscalls) && syscalls[num]) {
    p->trapframe->a0 = syscalls[num]();
    // bit `num` of tmask selects whether this call is traced
    if ((p->tmask >> num) & 1) {
      // a0 is a uint64; cast it so the argument matches %d.
      // No trailing space before the newline.
      printf("%d: syscall %s -> %d\n", p->pid, syscalls_name[num], (int)p->trapframe->a0);
    }
  } else {
    printf("%d %s: unknown sys call %d\n",
            p->pid, p->name, num);
    p->trapframe->a0 = -1;
  }
}

Sysinfo

这里要实现一个系统调用 sysinfo，参数是一个 sysinfo 结构体，用来收集系统信息，包括有多少空闲内存，正在运行的进程数。

根据实验提示首先在 Makefile 里把 $U/_sysinfotest 加到 UPROGS。

声明 sysinfo 系统调用

1
2


// forward declaration: only a pointer to the struct is needed here
struct sysinfo;
// user-side prototype of the sysinfo system call
int sysinfo(struct sysinfo *);

sysinfo 需要将 struct sysinfo 拷贝回用户空间，这里可以参考 sys_fstat()

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17


uint64
sys_info(void)
{
  struct proc *p = myproc();
  uint64 addr;

  if(argaddr(0, &addr) < 0)
    return -1;

  struct sysinfo si;
  si.freemem = get_free_memory();
  si.nproc = get_number_of_processes();

  if(copyout(p->pagetable, addr, (char *)&si, sizeof(si)) < 0)
    return -1;
  return 0;
}

分别实现 get_free_memory() 和 get_number_of_processes()。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11


// Return the amount of free physical memory, in bytes: PGSIZE for
// every node on the allocator's free list.
//
// The list is walked while holding the allocator lock so that a
// concurrent allocation or free cannot change it mid-traversal.
// NOTE(review): assumes kmem has a spinlock field named `lock`
// protecting `freelist`, as in stock xv6 kalloc.c -- confirm.
uint64 get_free_memory(void) {
  uint64 free_bytes = 0;
  struct run *r;

  acquire(&kmem.lock);
  for (r = kmem.freelist; r; r = r->next)
    free_bytes += PGSIZE;
  release(&kmem.lock);

  return free_bytes;
}

 1
 2
 3
 4
 5
 6
 7
 8
 9
10


// Count the entries of the process table whose state is not UNUSED.
// Each state is read without taking that process's lock, so the
// result is a snapshot.
uint64 get_number_of_processes() {
  uint64 live = 0;

  for (int i = 0; i < NPROC; i++) {
    if (proc[i].state == UNUSED)
      continue;
    live++;
  }

  return live;
}

总结

通过这个实验对系统调用是如何工作的有了更深的了解，同时也学习了一下 RISC-V，对 RISC-V 有了一些了解。

Contents

RISC-V 调用规约 （calling convention)

系统调用流程

System call tracing

Sysinfo

总结

RISC-V 调用规约（calling convention)