syzkaller 源码阅读笔记 —— syz-manager

syzkaller 中的 syz-manager 负责管理虚拟机,监控 crash,以及复现 crash

main

从 main 函数开始,解析参数,加载 config 文件,随后调用 RunManager

// syz-manager/manager.go

// main is the syz-manager entry point: it rejects binaries that carry no git
// revision, parses command-line flags, loads the config file named by
// flagConfig and passes the result to RunManager.
func main() {
	// The fatal message below tells the user to build with make, so an empty
	// revision is treated as a bad build.
	if rev := prog.GitRevision; rev == "" {
		log.Fatalf("bad syz-manager build: build with make, run bin/syz-manager")
	}

	flag.Parse()
	log.EnableLogCaching(1000, 1<<20)

	config, loadErr := mgrconfig.LoadFile(*flagConfig)
	if loadErr != nil {
		log.Fatalf("%v", loadErr)
	}
	RunManager(config)
}

RunManager

// syz-manager/manager.go

// RunManager wires up a Manager from cfg and runs it: it creates the VM pool
// (unless cfg.Type is "none"), the crash directory and the reporter, starts
// the RPC server and a periodic stats logger, and finally either waits for
// shutdown (no VM pool) or enters mgr.vmLoop.
func RunManager(cfg *mgrconfig.Config) {
	var vmPool *vm.Pool
	// Type "none" is a special case for debugging/development when manager
	// does not start any VMs, but instead you start them manually
	// and start syz-fuzzer there.
	if cfg.Type != "none" {
		var err error
		vmPool, err = vm.Create(cfg, *flagDebug) // 「1」
		if err != nil {
			log.Fatalf("%v", err)
		}
	}

	// Crashes are stored under <workdir>/crashes.
	crashdir := filepath.Join(cfg.Workdir, "crashes")
	osutil.MkdirAll(crashdir)

	reporter, err := report.NewReporter(cfg)
	if err != nil {
		log.Fatalf("%v", err)
	}

	// Remaining Manager fields are elided in this excerpt.
	mgr := &Manager{
		cfg:              cfg,
		vmPool:           vmPool,
		//...
	}

	mgr.preloadCorpus()
	mgr.initStats() // Initializes prometheus variables.
	mgr.initHTTP()  // Creates HTTP server.
	mgr.collectUsedFiles()

	// Create RPC server for fuzzers.
	mgr.serv, err = startRPCServer(mgr)
	if err != nil {
		log.Fatalf("failed to create rpc server: %v", err)
	}

	if cfg.DashboardAddr != "" {
		// ...
	}

	// Stats logger: an endless loop that wakes up every 10 seconds.
	go func() { // 「2」
		for lastTime := time.Now(); ; {
			time.Sleep(10 * time.Second)
			now := time.Now()
			diff := now.Sub(lastTime)
			lastTime = now
			mgr.mu.Lock()
			// Nothing is accounted or logged while firstConnect is still unset.
			if mgr.firstConnect.IsZero() {
				mgr.mu.Unlock()
				continue
			}
			// Accumulate VM-time: elapsed wall time multiplied by the number
			// of VMs currently counted in numFuzzing.
			mgr.fuzzingTime += diff * time.Duration(atomic.LoadUint32(&mgr.numFuzzing))
			executed := mgr.stats.execTotal.get()
			crashes := mgr.stats.crashes.get()
			corpusCover := mgr.stats.corpusCover.get()
			corpusSignal := mgr.stats.corpusSignal.get()
			maxSignal := mgr.stats.maxSignal.get()
			mgr.mu.Unlock()
			// The two counters below are read atomically, outside mgr.mu.
			numReproducing := atomic.LoadUint32(&mgr.numReproducing)
			numFuzzing := atomic.LoadUint32(&mgr.numFuzzing)

			log.Logf(0, "VMs %v, executed %v, cover %v, signal %v/%v, crashes %v, repro %v",
				numFuzzing, executed, corpusCover, corpusSignal, maxSignal, crashes, numReproducing)
		}
	}()

	if *flagBench != "" { 
	// ...
	}

	if mgr.dash != nil {
		go mgr.dashboardReporter()
	}

	osutil.HandleInterrupts(vm.Shutdown)
	// Without a VM pool (type "none") the manager only prints how to start
	// syz-fuzzer by hand and then blocks on vm.Shutdown.
	if mgr.vmPool == nil {
		log.Logf(0, "no VMs started (type=none)")
		log.Logf(0, "you are supposed to start syz-fuzzer manually as:")
		log.Logf(0, "syz-fuzzer -manager=manager.ip:%v [other flags as necessary]", mgr.serv.port)
		<-vm.Shutdown
		return
	}
	mgr.vmLoop() // 「3」
}

「1」,调用 vm.Create 创建 vmPool。顾名思义,vmPool 是用于管理虚拟机的池;不同的虚拟化方案都通过统一的接口来创建 vmPool,具体随后细述
「2」,开启一个 goroutine,定时记录虚拟机状态,覆盖率,crash 数量等信息
「3」,调用 mgr.vmLoop 开始启动虚拟机进行 fuzz,这个函数涵盖了大量的工作

vm.Create

// vm/vm.go

// Create looks up the VM backend registered under cfg.Type, constructs that
// backend's pool from the manager config and wraps it in a *Pool.
// It returns an error if the type is not registered or if the backend
// constructor fails.
func Create(cfg *mgrconfig.Config, debug bool) (*Pool, error) {
	backend, registered := vmimpl.Types[cfg.Type] // 「1」
	if !registered {
		return nil, fmt.Errorf("unknown instance type '%v'", cfg.Type)
	}

	// Everything the backend constructor receives is copied from cfg,
	// plus the debug flag.
	env := &vmimpl.Env{
		Name:     cfg.Name,
		OS:       cfg.TargetOS,
		Arch:     cfg.TargetVMArch,
		Workdir:  cfg.Workdir,
		Image:    cfg.Image,
		SSHKey:   cfg.SSHKey,
		SSHUser:  cfg.SSHUser,
		Timeouts: cfg.Timeouts,
		Debug:    debug,
		Config:   cfg.VM,
	}

	poolImpl, err := backend.Ctor(env) // 「2」
	if err != nil {
		return nil, err
	}

	pool := &Pool{
		impl:     poolImpl,
		workdir:  env.Workdir,
		template: cfg.WorkdirTemplate,
		timeouts: cfg.Timeouts,
	}
	return pool, nil
}

「1」,根据配置文件的 type 字段获取对应 Type 对象为后续使用,type 字段表示虚拟机的类型,如 qemu,vmware

// vm/vmimpl/vmimpl.go

// Type describes one registered VM backend; values are stored in Types by
// Register.
type Type struct {
	// Ctor is the backend's constructor function.
	Ctor       ctorFunc
	// Overcommit holds the allowsOvercommit flag given to Register.
	Overcommit bool
}

var (
// ...
	// Types maps a VM type name (the typ argument of Register) to its Type.
	Types = make(map[string]Type)
)

// Register records a VM backend in Types under the name typ, storing its
// constructor and whether it allows overcommit. Registering the same name
// again overwrites the earlier entry.
func Register(typ string, ctor ctorFunc, allowsOvercommit bool) {
	entry := Type{Ctor: ctor, Overcommit: allowsOvercommit}
	Types[typ] = entry
}

各种虚拟化对应的模块调用 Register 函数注册 Type 对象,以 qemu 为例

// vm/qemu/qemu.go

// init registers the qemu backend with vmimpl at package load time.
func init() {
	// Register ctor under the name "qemu"; the final argument is the
	// allowsOvercommit flag.
	vmimpl.Register("qemu", ctor, true)

	// Compile-time assertion that *instance implements vmimpl.Infoer.
	var _ vmimpl.Infoer = (*instance)(nil)
}

「2」,调用对应 Type 对象的 Ctor 函数指针,创建实现了 Pool 接口的对象;以 qemu 为例,查看 qemu.ctor 函数的实现

// vm/vmimpl/vmimpl.go
// Pool is the interface every VM backend's pool must implement.
type Pool interface {
	// Count returns total number of VMs in the pool.
	Count() int

	// Create creates and boots a new VM instance.
	Create(workdir string, index int) (Instance, error)
}

// vm/qemu/qemu.go

// Pool is the qemu backend's pool state, populated by ctor.
type Pool struct {
	// env is the environment handed to ctor.
	env        *vmimpl.Env
	// cfg is the qemu-specific configuration built in ctor.
	cfg        *Config
	// target is looked up with targets.Get(env.OS, env.Arch) in ctor.
	target     *targets.Target
	// archConfig is the archConfigs entry for "<OS>/<Arch>".
	archConfig *archConfig
	// version is filled from a value computed in a part of ctor not shown here.
	version    string
}

// ctor is the qemu constructor registered with vmimpl: it builds a Pool for
// the OS/arch described by env.
func ctor(env *vmimpl.Env) (vmimpl.Pool, error) {
	// Per-architecture settings are keyed by "<OS>/<Arch>".
	archConfig := archConfigs[env.OS+"/"+env.Arch]
	// Initial values; the qemu binary, its arguments and the network device
	// come from the per-architecture entry.
	// NOTE(review): the elided section below presumably overlays the user's
	// VM config on top of these values; confirm against upstream.
	cfg := &Config{
		Count:       1,
		CPU:         1,
		Mem:         1024,
		ImageDevice: "hda",
		Qemu:        archConfig.Qemu,
		QemuArgs:    archConfig.QemuArgs,
		NetDev:      archConfig.NetDev,
		Snapshot:    true,
	}
	// ...

	// version is produced by the elided code above.
	pool := &Pool{
		env:        env,
		cfg:        cfg,
		version:    version,
		target:     targets.Get(env.OS, env.Arch),
		archConfig: archConfig,
	}
	return pool, nil
}

mgr.vmLoop

// syz-manager/manager.go

// vmLoop is the manager's scheduling loop. On each iteration it hands free VM
// indexes either to crash reproduction (repro.Run) or to fuzzing
// (mgr.runInstance); both run in goroutines that report back over reproDone
// and runDone. Setup and result handling are elided in this excerpt.
func (mgr *Manager) vmLoop() {
// ...
	// Loop while shutdown is non-nil, or while some VM indexes have not yet
	// been returned to instances.
	// NOTE(review): the log line below prints `shutdown == nil` as the
	// shutdown flag, so a nil value appears to mean "shutting down"; confirm.
	for shutdown != nil || len(instances) != vmCount {
		// Snapshot the phase under the lock.
		mgr.mu.Lock()
		phase := mgr.phase
		mgr.mu.Unlock()
// ...
		log.Logf(1, "loop: phase=%v shutdown=%v instances=%v/%v %+v repro: pending=%v reproducing=%v queued=%v",
			phase, shutdown == nil, len(instances), vmCount, instances,
			len(pendingRepro), len(reproducing), len(reproQueue))

		// canRepro reports whether another reproduction may start: the phase
		// has reached phaseTriagedHub, a crash is queued, and reserving
		// instancesPerRepro more VMs would not exceed vmCount.
		canRepro := func() bool {
			return phase >= phaseTriagedHub &&
				len(reproQueue) != 0 && reproInstances+instancesPerRepro <= vmCount
		}

		// New work is only started while shutdown is non-nil.
		if shutdown != nil { 
			for canRepro() && len(instances) >= instancesPerRepro { // 「1」
				// Pop the most recently queued crash and nil the slot before
				// shrinking the slice.
				last := len(reproQueue) - 1
				crash := reproQueue[last]
				reproQueue[last] = nil
				reproQueue = reproQueue[:last]
				// Copy the last instancesPerRepro free indexes into a fresh
				// slice, then remove them from instances.
				vmIndexes := append([]int{}, instances[len(instances)-instancesPerRepro:]...)
				instances = instances[:len(instances)-instancesPerRepro]
				reproInstances += instancesPerRepro
				atomic.AddUint32(&mgr.numReproducing, 1)
				log.Logf(1, "loop: starting repro of '%v' on instances %+v", crash.Title, vmIndexes)
				// Run the reproduction in the background; the outcome is
				// delivered on reproDone.
				go func() {
					features := mgr.checkResult.Features
					res, stats, err := repro.Run(crash.Output, mgr.cfg, features, mgr.reporter, mgr.vmPool, vmIndexes)
					reproDone <- &ReproResult{
						instances: vmIndexes,
						report0:   crash.Report,
						res:       res,
						stats:     stats,
						err:       err,
						hub:       crash.hub,
					}
				}()
			}
			// When no reproduction can start, every remaining free VM is used
			// for fuzzing, one goroutine per VM.
			for !canRepro() && len(instances) != 0 { // 「2」
				last := len(instances) - 1
				idx := instances[last]
				instances = instances[:last]
				log.Logf(1, "loop: starting instance %v", idx)
				// The outcome is delivered on runDone.
				go func() {
					crash, err := mgr.runInstance(idx)
					runDone <- &RunResult{idx, crash, err}
				}()
			}
		}

// ...
	}
}

「1」,存在需要复现的 crash,且有足够的虚拟机,则取出 instancesPerRepro (4 与 vmCount 的最小值) 个虚拟机以供调用 repro.Run 复现 crash

「2」,当前无法开始复现(canRepro 返回 false,例如复现队列为空)且还有空闲虚拟机时,取出一个虚拟机调用 mgr.runInstance 进行 fuzz

mgr.runInstance
// syz-manager/manager.go

// runInstance runs one fuzzing session on the VM with the given index (via
// runInstanceInner) and packages the outcome as a Crash. Part of the body,
// including the handling of err, is elided in this excerpt.
func (mgr *Manager) runInstance(index int) (*Crash, error) {
	mgr.checkUsedFiles()
	// The instance name "vm-<index>" is passed both to runInstanceInner and
	// to the RPC server's shutdownInstance.
	instanceName := fmt.Sprintf("vm-%d", index)

	rep, vmInfo, err := mgr.runInstanceInner(index, instanceName) 

	// If the VM produced info, prepend it (followed by a newline) to the
	// bytes returned by shutdownInstance.
	machineInfo := mgr.serv.shutdownInstance(instanceName)
	if len(vmInfo) != 0 {
		machineInfo = append(append(vmInfo, '\n'), machineInfo...)
	}
	
// ...

	crash := &Crash{
		vmIndex:     index,
		hub:         false,
		Report:      rep,
		machineInfo: machineInfo,
	}
	return crash, nil
}

调用 mgr.runInstanceInner 后,整理输出返回

// syz-manager/manager.go

func (mgr *Manager) runInstanceInner(index int, instanceName string) (*report.Report, []byte, error) {
	inst, err := mgr.vmPool.Create(index)
	// ...
	
	fuzzerBin, err := inst.Copy(mgr.cfg.FuzzerBin) // 「1」
	// ...

	// If ExecutorBin is provided, it means that syz-executor is already in the image,
	// so no need to copy it.
	executorBin := mgr.sysTarget.ExecutorBin
	if executorBin == "" {
		executorBin, err = inst.Copy(mgr.cfg.ExecutorBin) // 「2」
		
		// ...
	}

// ...

	cmd := instance.FuzzerCmd(fuzzerBin, executorBin, instanceName,
		mgr.cfg.TargetOS, mgr.cfg.TargetArch, fwdAddr, mgr.cfg.Sandbox, procs, fuzzerV,
		mgr.cfg.Cover, *flagDebug, false, false, true, mgr.cfg.Timeouts.Slowdown) 
	outc, errc, err := inst.Run(mgr.cfg.Timeouts.VMRunningTime, mgr.vmStop, cmd) // 「3」
// ...

	var vmInfo []byte
	rep := inst.MonitorExecution(outc, errc, mgr.reporter, vm.ExitTimeout)4if rep == nil {
	// ...
	} else {
		vmInfo, err = inst.Info()
		if err != nil {
		// ...
		}
	}

	return rep, vmInfo, nil
}

「1」,复制 syz-fuzzer 到虚拟机中

「2」,复制 syz-executor 到虚拟机中

「3」,ssh 执行虚拟机里的 syz-fuzzer

// vm/qemu/qemu.go

// Run executes command inside the qemu VM. In the branch shown here the
// command is run over ssh as inst.sshuser@localhost, after changing into
// inst.targetDir(). The other branch and the process handling that produce
// the returned channels are elided.
func (inst *instance) Run(timeout time.Duration, stop <-chan bool, command string) (
	<-chan []byte, <-chan error, error) {
	// ...

	// ssh options, built from the instance's debug flag, ssh key, ssh port
	// and forwarded port.
	sshArgs := vmimpl.SSHArgsForward(inst.debug, inst.sshkey, inst.port, inst.forwardPort)
	// ...
	
	} else {
		// Resulting argv: ssh <sshArgs...> <user>@localhost "cd <targetDir> && <command>".
		args = []string{"ssh"}
		args = append(args, sshArgs...)
		args = append(args, inst.sshuser+"@localhost", "cd "+inst.targetDir()+" && "+command)
	}
	if inst.debug {
		log.Logf(0, "running command: %#v", args)
	}
	cmd := osutil.Command(args[0], args[1:]...)
	// ...
}

「4」,监控虚拟机 oops 信息,用 bytes.Contains 检查是否有特征序列来查看是否出现了 crash

repro.Run

最主要就是调用了 ctx.repro

// pkg/repro/repro.go

// repro runs the reproduction pipeline on the parsed log entries: extract the
// crashing program, minimize it, try to produce a C reproducer, and simplify
// options. Error handling between the steps is elided in this excerpt.
func (ctx *context) repro(entries []*prog.LogEntry, crashStart int) (*Result, error) {
// ...

	res, err := ctx.extractProg(entries) // 「1」

// ...

	res, err = ctx.minimizeProg(res) // 「2」

// ...

	// Try extracting C repro without simplifying options first.
	res, err = ctx.extractC(res) // 「3」

// ...

	// Simplify options and try extracting C repro.
	// This step runs only if no C repro was obtained above.
	if !res.CRepro {
		res, err = ctx.simplifyProg(res) // 「4」
// ...
	}

	// Simplify C related options.
	// This step runs only if a C repro exists at this point.
	if res.CRepro {
		res, err = ctx.simplifyC(res) // 「5」
// ...
	}

	return res, nil
}

「1」,提取出造成 crash 的程序

// pkg/repro/repro.go

// extractProg looks for the program(s) in the log that trigger the crash.
// For each configured timeout it first tries the last program of every proc
// individually (newest first), then falls back to bisecting the whole log.
// The handling of the bisect result and the final return are elided.
func (ctx *context) extractProg(entries []*prog.LogEntry) (*Result, error) {
// ...

	// Extract last program on every proc.
	// procs maps a proc id to the index of its last entry; later entries
	// overwrite earlier ones.
	procs := make(map[int]int)
	for i, ent := range entries {
		procs[ent.Proc] = i
	}
	var indices []int
	for _, idx := range procs {
		indices = append(indices, idx)
	}
	// Map iteration order is random, so sort to restore log order.
	sort.Ints(indices)
	var lastEntries []*prog.LogEntry
	for i := len(indices) - 1; i >= 0; i-- { // Store the executed programs in reverse order.
		lastEntries = append(lastEntries, entries[indices[i]])
	}
	for _, timeout := range ctx.testTimeouts {
		// Execute each program separately to detect simple crashes caused by a single program.
		// Programs are executed in reverse order, usually the last program is the guilty one.
		res, err := ctx.extractProgSingle(lastEntries, timeout)  // Run the programs one at a time until one triggers the crash.
		if err != nil {
			return nil, err
		}
		if res != nil {
			ctx.reproLogf(3, "found reproducer with %d syscalls", len(res.Prog.Calls))
			return res, nil
		}

		// Don't try bisecting if there's only one entry.
		if len(entries) == 1 {
			continue
		}

		// Execute all programs and bisect the log to find multiple guilty programs.
		res, err = ctx.extractProgBisect(entries, timeout) // No single program triggers the crash; several may be needed.
		
// ...
	}

// ...
}

「2」,简化调用和参数

// pkg/repro/repro.go

// minimizeProg shrinks res.Prog with prog.Minimize, called with call index -1
// and crash=true, and returns res with the minimized program.
func (ctx *context) minimizeProg(res *Result) (*Result, error) {
// ...
	res.Prog, _ = prog.Minimize(res.Prog, -1, true,
		// The predicate re-runs each candidate program and reports whether it
		// still crashes.
		func(p1 *prog.Prog, callIndex int) bool {
			crashed, err := ctx.testProg(p1, res.Duration, res.Opts)
			if err != nil {
				// A failed test run is logged and counted as "did not crash".
				ctx.reproLogf(0, "minimization failed with %v", err)
				return false
			}
			return crashed
		})

	return res, nil
}

// prog/minimization.go

// Minimize tries to shrink p0 while pred0 keeps returning true for the
// candidate. The visible steps are: remove calls, reset call props, then
// minimize the arguments and props of each remaining call. It returns the
// minimized program and the updated call index. Parts are elided.
func Minimize(p0 *Prog, callIndex0 int, crash bool, pred0 func(*Prog, int) bool) (*Prog, int) {
	// pred wraps pred0 so that every candidate goes through sanitizeFix and
	// debugValidate before it is tested.
	pred := func(p *Prog, callIndex int) bool {
		p.sanitizeFix()
		p.debugValidate()
		return pred0(p, callIndex)
	}
// ...

	// Try to remove all calls except the last one one-by-one.
	p0, callIndex0 = removeCalls(p0, callIndex0, crash, pred)

	// Try to reset all call props to their default values.
	p0 = resetCallProps(p0, callIndex0, pred)
	

	// Try to minimize individual calls.
	for i := 0; i < len(p0.Calls); i++ { // Strip arguments that are irrelevant to the syscall.
		// ctx holds a pointer to p0.
		// NOTE(review): presumably ctx.do updates the program through it; confirm.
		ctx := &minimizeArgsCtx{
			target:     p0.Target,
			p0:         &p0,
			callIndex0: callIndex0,
			crash:      crash,
			pred:       pred,
			triedPaths: make(map[string]bool),
		}
	again:
		// Each pass works on a fresh clone of the current p0.
		ctx.p = p0.Clone()
		ctx.call = ctx.p.Calls[i]
		for j, field := range ctx.call.Meta.Args {
			// When ctx.do returns true, start over for this call from a new clone.
			if ctx.do(ctx.call.Args[j], field.Name, "") {
				goto again
			}
		}
		p0 = minimizeCallProps(p0, i, callIndex0, pred)
	}
// ...
	return p0, callIndex0
}

「3」,根据 crash 构造 C 代码

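「4」,如果「3」未能得到 C 复现程序(res.CRepro 为 false),则调用 simplifyProg 简化执行选项,并再次尝试提取 C 代码

「5」,如果已经得到 C 复现程序(res.CRepro 为 true),则调用 simplifyC 进一步简化与 C 相关的选项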

参考