linsk/vm/vm.go

484 lines
11 KiB
Go
Raw Normal View History

2023-08-25 15:12:19 +01:00
package vm
import (
"bufio"
"bytes"
"context"
"fmt"
"io"
"os/exec"
"sync"
"sync/atomic"
"time"
2023-08-25 16:54:58 +01:00
"log/slog"
2023-09-01 11:44:49 +01:00
"github.com/AlexSSD7/linsk/osspecifics"
2023-09-02 11:10:14 +01:00
"github.com/AlexSSD7/linsk/qemucli"
2023-08-29 14:24:18 +01:00
"github.com/AlexSSD7/linsk/sshutil"
2023-08-27 15:53:44 +01:00
"github.com/AlexSSD7/linsk/utils"
2023-08-26 16:26:35 +01:00
"github.com/bramvdbogaerde/go-scp"
2023-08-25 15:12:19 +01:00
"github.com/phayes/freeport"
"github.com/pkg/errors"
"go.uber.org/multierr"
"golang.org/x/crypto/ssh"
)
2023-08-27 13:44:57 +01:00
type VM struct {
2023-08-25 16:54:58 +01:00
logger *slog.Logger
2023-08-25 15:12:19 +01:00
ctx context.Context
ctxCancel context.CancelFunc
cmd *exec.Cmd
sshMappedPort uint16
sshConf *ssh.ClientConfig
sshReadyCh chan struct{}
2023-08-27 15:30:51 +01:00
installSSH bool
2023-08-25 15:12:19 +01:00
serialRead *io.PipeReader
serialReader *bufio.Reader
serialWrite *io.PipeWriter
serialWriteMu sync.Mutex
2023-08-29 11:51:06 +01:00
qemuStderrBuf *bytes.Buffer
osUpTimeout time.Duration
sshUpTimeout time.Duration
2023-08-25 15:12:19 +01:00
serialStdoutCh chan []byte
// These are to be interacted with using `atomic` package
disposed uint32
canceled uint32
2023-08-29 15:31:17 +01:00
originalCfg Config
2023-08-25 15:12:19 +01:00
}
2023-08-27 15:30:51 +01:00
// DriveConfig describes a single drive to be attached to the VM.
type DriveConfig struct {
	// Path is the location of the drive on the host.
	Path string

	// SnapshotMode selects snapshot mode for this drive.
	SnapshotMode bool
}
// TapConfig describes a single tap network device to be attached to the VM.
type TapConfig struct {
	// Name is the name of the tap device.
	Name string
}
type Config struct {
2023-08-27 13:44:57 +01:00
CdromImagePath string
2023-08-30 13:13:08 +01:00
BIOSPath string
2023-08-27 15:53:44 +01:00
Drives []DriveConfig
2023-08-27 13:44:57 +01:00
2023-08-29 11:51:06 +01:00
MemoryAlloc uint32 // In KiB.
2023-08-29 10:59:50 +01:00
2023-08-29 15:31:17 +01:00
PassthroughConfig PassthroughConfig
2023-08-27 13:44:57 +01:00
ExtraPortForwardingRules []PortForwardingRule
// Networking
UnrestrictedNetworking bool
Taps []TapConfig
2023-08-29 11:51:06 +01:00
// Timeouts
OSUpTimeout time.Duration
SSHUpTimeout time.Duration
2023-08-27 15:30:51 +01:00
// Mostly debug-related options.
ShowDisplay bool
InstallBaseUtilities bool
2023-08-27 13:44:57 +01:00
}
func NewVM(logger *slog.Logger, cfg Config) (*VM, error) {
2023-08-25 15:12:19 +01:00
sshPort, err := freeport.GetFreePort()
if err != nil {
return nil, errors.Wrap(err, "get free port for ssh server")
}
2023-09-02 11:10:14 +01:00
baseCmd, cmdArgs, err := configureBaseVMCmd(logger, cfg)
if err != nil {
return nil, errors.Wrap(err, "configure base vm cmd")
}
2023-09-02 11:10:14 +01:00
netCmdArgs, err := configureVMCmdNetworking(logger, cfg, uint16(sshPort))
if err != nil {
return nil, errors.Wrap(err, "configure vm cmd networking")
2023-08-25 15:12:19 +01:00
}
2023-09-02 11:10:14 +01:00
cmdArgs = append(cmdArgs, netCmdArgs...)
2023-08-30 13:32:00 +01:00
2023-09-02 11:10:14 +01:00
driveCmdArgs, err := configureVMCmdDrives(cfg)
if err != nil {
return nil, errors.Wrap(err, "configure vm cmd drives")
2023-08-29 15:31:17 +01:00
}
2023-09-02 11:10:14 +01:00
cmdArgs = append(cmdArgs, driveCmdArgs...)
2023-08-29 15:31:17 +01:00
2023-09-02 11:10:14 +01:00
usbCmdArgs := configureVMCmdUSBPassthrough(cfg)
2023-08-29 15:31:17 +01:00
2023-09-02 11:10:14 +01:00
cmdArgs = append(cmdArgs, usbCmdArgs...)
2023-08-29 15:37:09 +01:00
2023-09-02 11:10:14 +01:00
blockDevArgs, err := configureVMCmdBlockDevicePassthrough(logger, cfg)
if err != nil {
return nil, errors.Wrap(err, "configure vm cmd block device passthrough")
2023-08-27 15:30:51 +01:00
}
2023-09-02 11:10:14 +01:00
cmdArgs = append(cmdArgs, blockDevArgs...)
2023-08-27 15:30:51 +01:00
if cfg.InstallBaseUtilities && !cfg.UnrestrictedNetworking {
2023-09-01 16:40:13 +01:00
return nil, fmt.Errorf("installation of base utilities is impossible with unrestricted networking disabled")
2023-08-27 15:30:51 +01:00
}
2023-08-29 11:51:06 +01:00
// NOTE: The default timeouts below have no relation to the default
// timeouts set by the CLI. These work only if no timeout was supplied
// in the config programmatically. Defaults set here are quite conservative.
osUpTimeout := time.Second * 60
if cfg.OSUpTimeout != 0 {
osUpTimeout = cfg.OSUpTimeout
}
sshUpTimeout := time.Second * 120
if cfg.SSHUpTimeout != 0 {
sshUpTimeout = cfg.SSHUpTimeout
}
if sshUpTimeout < osUpTimeout {
return nil, fmt.Errorf("vm ssh setup timeout cannot be lower than os up timeout")
}
2023-09-02 11:10:14 +01:00
encodedCmdArgs, err := qemucli.EncodeArgs(cmdArgs)
if err != nil {
return nil, errors.Wrap(err, "encode qemu cli args")
}
2023-08-29 11:51:06 +01:00
// No errors beyond this point.
2023-08-25 15:12:19 +01:00
sysRead, userWrite := io.Pipe()
userRead, sysWrite := io.Pipe()
2023-09-02 12:14:02 +01:00
cmd := exec.Command(baseCmd, encodedCmdArgs...) //#nosec G204 // I know, it's generally a bad idea to include variables into shell commands, but QEMU unfortunately does not accept anything else.
2023-08-25 15:12:19 +01:00
cmd.Stdin = sysRead
cmd.Stdout = sysWrite
stderrBuf := bytes.NewBuffer(nil)
cmd.Stderr = stderrBuf
2023-08-28 11:35:57 +02:00
// This function is OS-specific.
2023-09-01 11:44:49 +01:00
osspecifics.SetNewProcessGroupCmd(cmd)
2023-08-25 19:55:11 +01:00
2023-08-25 15:12:19 +01:00
userReader := bufio.NewReader(userRead)
ctx, ctxCancel := context.WithCancel(context.Background())
2023-08-27 13:44:57 +01:00
vm := &VM{
2023-08-25 15:12:19 +01:00
logger: logger,
ctx: ctx,
ctxCancel: ctxCancel,
cmd: cmd,
sshMappedPort: uint16(sshPort),
sshReadyCh: make(chan struct{}),
2023-08-27 15:30:51 +01:00
installSSH: cfg.InstallBaseUtilities,
2023-08-25 15:12:19 +01:00
2023-08-29 11:51:06 +01:00
serialRead: userRead,
serialReader: userReader,
serialWrite: userWrite,
qemuStderrBuf: stderrBuf,
osUpTimeout: osUpTimeout,
sshUpTimeout: sshUpTimeout,
2023-08-29 15:31:17 +01:00
originalCfg: cfg,
2023-08-25 15:12:19 +01:00
}
2023-08-27 13:44:57 +01:00
vm.resetSerialStdout()
2023-08-25 15:12:19 +01:00
2023-08-27 13:44:57 +01:00
return vm, nil
2023-08-25 15:12:19 +01:00
}
2023-08-27 13:44:57 +01:00
func (vm *VM) Run() error {
if atomic.AddUint32(&vm.disposed, 1) != 1 {
2023-08-25 15:12:19 +01:00
return fmt.Errorf("vm disposed")
}
2023-08-27 13:44:57 +01:00
err := vm.cmd.Start()
2023-08-25 15:12:19 +01:00
if err != nil {
return errors.Wrap(err, "start qemu cmd")
}
2023-08-29 15:31:17 +01:00
go vm.runPeriodicHostMountChecker()
2023-08-25 15:12:19 +01:00
var globalErrsMu sync.Mutex
var globalErrs []error
globalErrFn := func(err error) {
globalErrsMu.Lock()
defer globalErrsMu.Unlock()
2023-08-27 13:44:57 +01:00
globalErrs = append(globalErrs, err, errors.Wrap(vm.Cancel(), "cancel on error"))
2023-08-25 15:12:19 +01:00
}
2023-08-29 11:51:06 +01:00
bootReadyCh := make(chan struct{})
go func() {
select {
case <-time.After(vm.osUpTimeout):
2023-08-30 13:38:34 +01:00
vm.logger.Warn("A VM boot timeout detected, consider running with --vm-debug to investigate")
2023-08-29 11:51:06 +01:00
globalErrFn(fmt.Errorf("vm boot timeout %v", utils.GetLogErrMsg(string(vm.consumeSerialStdout()), "serial log")))
case <-bootReadyCh:
vm.logger.Info("The VM is up, setting it up")
}
}()
go func() {
select {
case <-time.After(vm.sshUpTimeout):
globalErrFn(fmt.Errorf("vm setup timeout %v", utils.GetLogErrMsg(string(vm.consumeSerialStdout()), "serial log")))
case <-vm.sshReadyCh:
vm.logger.Info("The VM is ready")
}
}()
2023-08-27 13:44:57 +01:00
vm.logger.Info("Booting the VM")
2023-08-25 15:12:19 +01:00
go func() {
2023-08-27 13:44:57 +01:00
_ = vm.runSerialReader()
_ = vm.Cancel()
2023-08-25 15:12:19 +01:00
}()
go func() {
2023-08-27 13:44:57 +01:00
err = vm.runVMLoginHandler()
2023-08-25 15:12:19 +01:00
if err != nil {
globalErrFn(errors.Wrap(err, "run vm login handler"))
return
}
2023-08-29 11:51:06 +01:00
// This will disable the timeout-handling goroutine.
close(bootReadyCh)
2023-08-25 16:54:58 +01:00
2023-08-27 13:44:57 +01:00
sshSigner, err := vm.sshSetup()
2023-08-25 15:12:19 +01:00
if err != nil {
globalErrFn(errors.Wrap(err, "set up ssh"))
return
}
2023-08-27 13:44:57 +01:00
vm.logger.Debug("Set up SSH server successfully")
2023-08-25 15:12:19 +01:00
2023-08-27 13:44:57 +01:00
sshKeyScan, err := vm.scanSSHIdentity()
2023-08-25 15:12:19 +01:00
if err != nil {
globalErrFn(errors.Wrap(err, "scan ssh identity"))
return
}
2023-08-27 13:44:57 +01:00
vm.logger.Debug("Scanned SSH identity")
2023-08-25 15:12:19 +01:00
knownHosts, err := ParseSSHKeyScan(sshKeyScan)
if err != nil {
globalErrFn(errors.Wrap(err, "parse ssh key scan"))
return
}
2023-08-27 13:44:57 +01:00
vm.sshConf = &ssh.ClientConfig{
2023-08-25 15:12:19 +01:00
User: "root",
HostKeyCallback: knownHosts,
Auth: []ssh.AuthMethod{
ssh.PublicKeys(sshSigner),
},
Timeout: time.Second * 5,
}
// This is to notify everyone waiting for SSH to be up that it's ready to go.
2023-08-27 13:44:57 +01:00
close(vm.sshReadyCh)
2023-08-25 15:12:19 +01:00
}()
2023-08-27 13:44:57 +01:00
_, err = vm.cmd.Process.Wait()
cancelErr := vm.Cancel()
2023-08-25 15:12:19 +01:00
if err != nil {
combinedErr := multierr.Combine(
errors.Wrap(err, "wait for cmd to finish execution"),
errors.Wrap(cancelErr, "cancel"),
)
2023-08-29 11:51:06 +01:00
return fmt.Errorf("%w %v", combinedErr, utils.GetLogErrMsg(vm.qemuStderrBuf.String(), "qemu stderr log"))
2023-08-25 15:12:19 +01:00
}
combinedErr := multierr.Combine(
append(globalErrs, errors.Wrap(cancelErr, "cancel on exit"))...,
)
2023-08-25 16:54:58 +01:00
if combinedErr != nil {
2023-08-29 11:51:06 +01:00
return fmt.Errorf("%w %v", combinedErr, utils.GetLogErrMsg(vm.qemuStderrBuf.String(), "qemu stderr log"))
2023-08-25 16:54:58 +01:00
}
2023-08-25 15:12:19 +01:00
2023-08-25 16:54:58 +01:00
return nil
2023-08-25 15:12:19 +01:00
}
2023-08-27 13:44:57 +01:00
func (vm *VM) Cancel() error {
if atomic.AddUint32(&vm.canceled, 1) != 1 {
2023-08-25 15:12:19 +01:00
return nil
}
2023-08-27 13:44:57 +01:00
vm.logger.Warn("Canceling the VM context")
2023-08-25 19:55:11 +01:00
var gracefulOK bool
2023-08-27 13:44:57 +01:00
sc, err := vm.DialSSH()
2023-08-25 19:55:11 +01:00
if err != nil {
if !errors.Is(err, ErrSSHUnavailable) {
2023-08-29 10:59:50 +01:00
vm.logger.Warn("Failed to dial VM SSH to do graceful shutdown", "error", err.Error())
2023-08-25 19:55:11 +01:00
}
} else {
2023-08-28 11:35:57 +02:00
vm.logger.Warn("Sending poweroff command to the VM")
2023-08-29 14:24:18 +01:00
_, err = sshutil.RunSSHCmd(context.Background(), sc, "poweroff")
2023-08-25 19:55:11 +01:00
_ = sc.Close()
if err != nil {
2023-08-29 10:59:50 +01:00
vm.logger.Warn("Could not power off the VM safely", "error", err.Error())
2023-08-25 19:55:11 +01:00
} else {
2023-08-27 13:44:57 +01:00
vm.logger.Info("Shutting the VM down safely")
2023-08-25 19:55:11 +01:00
}
}
var interruptErr error
if !gracefulOK {
2023-08-28 11:35:57 +02:00
if vm.cmd.Process == nil {
interruptErr = fmt.Errorf("process is not started")
} else {
2023-09-01 11:44:49 +01:00
interruptErr = osspecifics.TerminateProcess(vm.cmd.Process.Pid)
2023-08-28 11:35:57 +02:00
}
2023-08-25 19:55:11 +01:00
}
2023-08-27 13:44:57 +01:00
vm.ctxCancel()
2023-08-25 15:12:19 +01:00
return multierr.Combine(
2023-08-25 19:55:11 +01:00
errors.Wrap(interruptErr, "interrupt cmd"),
2023-08-27 13:44:57 +01:00
errors.Wrap(vm.serialRead.Close(), "close serial read pipe"),
errors.Wrap(vm.serialWrite.Close(), "close serial write pipe"),
2023-08-25 15:12:19 +01:00
)
}
2023-08-27 13:44:57 +01:00
func (vm *VM) runSerialReader() error {
2023-08-25 15:12:19 +01:00
for {
2023-08-27 13:44:57 +01:00
raw, err := vm.serialReader.ReadBytes('\n')
2023-08-25 15:12:19 +01:00
if err != nil {
return errors.Wrap(err, "read from serial reader")
}
select {
2023-08-27 13:44:57 +01:00
case vm.serialStdoutCh <- raw:
2023-08-25 15:12:19 +01:00
default:
// Message gets discarded if the buffer is full.
}
}
}
2023-08-27 13:44:57 +01:00
func (vm *VM) writeSerial(b []byte) error {
vm.serialWriteMu.Lock()
defer vm.serialWriteMu.Unlock()
2023-08-25 15:12:19 +01:00
2023-08-28 11:35:57 +02:00
// What do you see below is a workaround for the way how serial console
// is implemented in QEMU/Windows pair. Apparently they are using polling,
// and this will ensure that we do not write faster than the polling rate.
for i := range b {
_, err := vm.serialWrite.Write([]byte{b[i]})
2023-08-29 13:29:46 +01:00
time.Sleep(time.Millisecond)
2023-08-28 11:35:57 +02:00
if err != nil {
return errors.Wrapf(err, "write char #%v", i)
}
}
return nil
2023-08-25 15:12:19 +01:00
}
2023-08-27 13:44:57 +01:00
func (vm *VM) runVMLoginHandler() error {
2023-08-25 15:12:19 +01:00
for {
select {
2023-08-27 13:44:57 +01:00
case <-vm.ctx.Done():
2023-08-29 11:51:06 +01:00
return vm.ctx.Err()
2023-08-25 15:12:19 +01:00
case <-time.After(time.Second):
2023-08-27 13:44:57 +01:00
peek, err := vm.serialReader.Peek(vm.serialReader.Buffered())
2023-08-25 15:12:19 +01:00
if err != nil {
return errors.Wrap(err, "peek stdout")
}
if bytes.Contains(peek, []byte("login:")) {
2023-08-27 13:44:57 +01:00
err = vm.writeSerial([]byte("root\n"))
2023-08-25 15:12:19 +01:00
if err != nil {
2023-09-01 16:40:13 +01:00
return errors.Wrap(err, "stdio write login")
2023-08-25 15:12:19 +01:00
}
2023-08-27 13:44:57 +01:00
vm.logger.Debug("Logged into the VM serial")
2023-08-25 15:12:19 +01:00
return nil
}
}
}
}
2023-08-27 13:44:57 +01:00
func (vm *VM) resetSerialStdout() {
vm.serialStdoutCh = make(chan []byte, 32)
2023-08-25 15:12:19 +01:00
}
2023-08-29 11:51:06 +01:00
// consumeSerialStdout drains everything currently queued in serialStdoutCh
// and returns it concatenated in arrival order. It never blocks: it returns
// as soon as the channel has nothing more to offer.
func (vm *VM) consumeSerialStdout() []byte {
	var drained []byte

	for {
		select {
		case chunk := <-vm.serialStdoutCh:
			drained = append(drained, chunk...)
		default:
			return drained
		}
	}
}
2023-08-27 13:44:57 +01:00
func (vm *VM) DialSSH() (*ssh.Client, error) {
if vm.sshConf == nil {
2023-08-25 15:12:19 +01:00
return nil, ErrSSHUnavailable
}
2023-08-27 13:44:57 +01:00
return ssh.Dial("tcp", "localhost:"+fmt.Sprint(vm.sshMappedPort), vm.sshConf)
2023-08-25 15:12:19 +01:00
}
2023-08-27 13:44:57 +01:00
func (vm *VM) DialSCP() (*scp.Client, error) {
if vm.sshConf == nil {
2023-08-26 16:26:35 +01:00
return nil, ErrSSHUnavailable
}
2023-08-27 13:44:57 +01:00
sc := scp.NewClient("localhost:"+fmt.Sprint(vm.sshMappedPort), vm.sshConf)
2023-08-26 16:26:35 +01:00
err := sc.Connect()
if err != nil {
2023-09-01 16:40:13 +01:00
return nil, errors.Wrap(err, "connect")
2023-08-26 16:26:35 +01:00
}
return &sc, nil
}
2023-08-27 13:44:57 +01:00
func (vm *VM) SSHUpNotifyChan() chan struct{} {
return vm.sshReadyCh
2023-08-25 15:12:19 +01:00
}
2023-08-29 15:31:17 +01:00
// It's always a user's responsibility to ensure that no drives are mounted
// in both host and guest system. This should serve as the last resort.
func (vm *VM) runPeriodicHostMountChecker() {
if len(vm.originalCfg.PassthroughConfig.Block) == 0 {
return
}
for {
select {
case <-vm.ctx.Done():
return
case <-time.After(time.Second):
for _, dev := range vm.originalCfg.PassthroughConfig.Block {
2023-09-01 11:44:49 +01:00
seemsMounted, err := osspecifics.CheckDeviceSeemsMounted(dev.Path)
2023-08-29 15:31:17 +01:00
if err != nil {
vm.logger.Warn("Failed to check if a passed device seems to be mounted", "dev-path", dev.Path)
continue
}
if seemsMounted {
2023-08-29 15:37:09 +01:00
_ = vm.cmd.Process.Kill()
2023-08-29 15:31:17 +01:00
panic(fmt.Sprintf("CRITICAL: Passed-through device '%v' appears to have been mounted on the host OS. Forcefully exiting now to prevent data corruption.", dev.Path))
}
}
}
}
}