Skip to content

Commit

Permalink
Checkpoint/Restore Support: add functionality to daemon
Browse files Browse the repository at this point in the history
Support was added to the daemon to use the Checkpoint and Restore methods
of the native exec driver for checkpointing and restoring containers.

Signed-off-by: Saied Kazemi <[email protected]>

Conflicts:
	api/server/server.go
	daemon/container.go
	daemon/daemon.go
	daemon/networkdriver/bridge/driver.go
	daemon/state.go
	vendor/src/github.com/docker/libnetwork/ipallocator/allocator.go

Conflicts:
	api/server/server.go
  • Loading branch information
Saied Kazemi authored and boucher committed Oct 1, 2015
1 parent cdf1ce2 commit c19d362
Show file tree
Hide file tree
Showing 7 changed files with 321 additions and 3 deletions.
30 changes: 30 additions & 0 deletions api/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,36 @@ func (s *HTTPServer) Close() error {
return s.l.Close()
}

// postContainersCheckpoint handles the HTTP request to checkpoint a
// running container. It dispatches the "checkpoint" engine job for the
// container named in the URL vars and replies 204 No Content on success.
func postContainersCheckpoint(eng *engine.Engine, version version.Version, w http.ResponseWriter, r *http.Request, vars map[string]string) error {
	if vars == nil {
		return fmt.Errorf("Missing parameter")
	}
	if err := parseForm(r); err != nil {
		return err
	}
	if err := eng.Job("checkpoint", vars["name"]).Run(); err != nil {
		return err
	}
	w.WriteHeader(http.StatusNoContent)
	return nil
}

// postContainersRestore handles the HTTP request to restore a
// checkpointed container. It dispatches the "restore" engine job for the
// container named in the URL vars and replies 204 No Content on success.
func postContainersRestore(eng *engine.Engine, version version.Version, w http.ResponseWriter, r *http.Request, vars map[string]string) error {
	if vars == nil {
		return fmt.Errorf("Missing parameter")
	}
	if err := parseForm(r); err != nil {
		return err
	}
	if err := eng.Job("restore", vars["name"]).Run(); err != nil {
		return err
	}
	w.WriteHeader(http.StatusNoContent)
	return nil
}

func writeCorsHeaders(w http.ResponseWriter, r *http.Request, corsHeaders string) {
logrus.Debugf("CORS header is enabled and set to: %s", corsHeaders)
w.Header().Add("Access-Control-Allow-Origin", corsHeaders)
Expand Down
55 changes: 55 additions & 0 deletions daemon/checkpoint.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package daemon

import (
"github.com/docker/docker/engine"
)

// ContainerCheckpoint is the "checkpoint" engine job handler. It looks up
// the container by name, verifies it is running, and checkpoints it via
// the exec driver, emitting a "checkpoint" event on success.
func (daemon *Daemon) ContainerCheckpoint(job *engine.Job) engine.Status {
	if len(job.Args) != 1 {
		return job.Errorf("Usage: %s CONTAINER\n", job.Name)
	}
	containerName := job.Args[0]

	container, err := daemon.Get(containerName)
	if err != nil {
		return job.Error(err)
	}
	// Only a live process can be checkpointed.
	if !container.IsRunning() {
		return job.Errorf("Container %s not running", containerName)
	}

	if err = container.Checkpoint(); err != nil {
		return job.Errorf("Cannot checkpoint container %s: %s", containerName, err)
	}
	container.LogEvent("checkpoint")
	return engine.StatusOK
}

// ContainerRestore is the "restore" engine job handler. It looks up the
// container by name, verifies it is stopped and has been checkpointed,
// and restores it via the exec driver.
//
// On success it emits a "restore" event; on a failed restore attempt it
// emits "die" so event listeners observe the failure.
func (daemon *Daemon) ContainerRestore(job *engine.Job) engine.Status {
	if len(job.Args) != 1 {
		return job.Errorf("Usage: %s CONTAINER\n", job.Name)
	}

	name := job.Args[0]
	container, err := daemon.Get(name)
	if err != nil {
		return job.Error(err)
	}
	if container.IsRunning() {
		return job.Errorf("Container %s already running", name)
	}
	// Use the promoted IsCheckpointed() accessor, consistent with the
	// other call sites in the daemon (container cleanup, daemon restore),
	// instead of reaching through the embedded State field directly.
	if !container.IsCheckpointed() {
		return job.Errorf("Container %s is not checkpointed", name)
	}

	if err := container.Restore(); err != nil {
		container.LogEvent("die")
		return job.Errorf("Cannot restore container %s: %s", name, err)
	}

	container.LogEvent("restore")
	return engine.StatusOK
}
66 changes: 64 additions & 2 deletions daemon/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -332,10 +332,15 @@ func (container *Container) isNetworkAllocated() bool {
return container.NetworkSettings.IPAddress != ""
}


// cleanup releases any network resources allocated to the container along with any rules
// around how containers are linked together. It also unmounts the container's root filesystem.
func (container *Container) cleanup() {
container.releaseNetwork()
if container.IsCheckpointed() {
log.CRDbg("not calling ReleaseNetwork() for checkpointed container %s", container.ID)
} else {
container.ReleaseNetwork()
}

if err := container.unmountIpcMounts(); err != nil {
logrus.Errorf("%s: Failed to umount ipc filesystems: %v", container.ID, err)
Expand Down Expand Up @@ -677,6 +682,41 @@ func (container *Container) copy(resource string) (rc io.ReadCloser, err error)
return reader, nil
}

// Checkpoint checkpoints this container by delegating to the daemon,
// which drives the exec driver's Checkpoint method on the container's
// command.
func (container *Container) Checkpoint() error {
	return container.daemon.Checkpoint(container)
}

// Restore brings a previously checkpointed container back to a running
// state. It mirrors the setup half of a normal start: networking, linked
// containers, the working directory, and the exec-driver command are
// prepared before handing off to waitForRestore().
//
// The deferred cleanup releases any resources already acquired when any
// step fails; every failure path therefore assigns to the named err.
func (container *Container) Restore() error {
	var err error

	container.Lock()
	defer container.Unlock()

	// cleanup() inspects err, so this defer must run after err has its
	// final value for the function.
	defer func() {
		if err != nil {
			container.cleanup()
		}
	}()

	if err = container.initializeNetworking(); err != nil {
		return err
	}

	linkedEnv, err := container.setupLinkedContainers()
	if err != nil {
		return err
	}
	if err = container.setupWorkingDirectory(); err != nil {
		return err
	}
	env := container.createDaemonEnvironment(linkedEnv)
	if err = populateCommandRestore(container, env); err != nil {
		return err
	}

	// Assign to err rather than returning directly: the original
	// `return container.waitForRestore()` left err nil, so the deferred
	// cleanup never ran when the restore-wait itself failed.
	err = container.waitForRestore()
	return err
}

// Returns true if the container exposes a certain port
func (container *Container) exposes(p nat.Port) bool {
_, exists := container.Config.ExposedPorts[p]
Expand Down Expand Up @@ -766,6 +806,29 @@ func (container *Container) waitForStart() error {
return nil
}

// Like waitForStart() but for restoring a container: blocks until the
// container monitor signals that the restore has reached a running state
// (or the monitor goroutine exits with an error first).
//
// XXX Does RestartPolicy apply here?
func (container *Container) waitForRestore() error {
	container.monitor = newContainerMonitor(container, container.hostConfig.RestartPolicy)

	// After calling promise.Go() we'll have two goroutines:
	// - The current goroutine that will block in the select
	// below until restore is done.
	// - A new goroutine that will restore the container and
	// wait for it to exit.
	//
	// NOTE(review): promise.Go(container.monitor.Restore) is evaluated
	// when the select's case expressions are set up, so the monitor
	// goroutine starts before either case can fire.
	select {
	case <-container.monitor.restoreSignal:
		// restoreSignal is closed by restoreCallback(); a non-zero
		// ExitCode at this point means the restore process failed.
		if container.ExitCode != 0 {
			return fmt.Errorf("restore process failed")
		}
	case err := <-promise.Go(container.monitor.Restore):
		// The monitor returned before signaling a successful restore.
		return err
	}

	return nil
}

func (container *Container) getProcessLabel() string {
// even if we have a process label return "" if we are running
// in privileged mode
Expand Down Expand Up @@ -966,7 +1029,6 @@ func attach(streamConfig *streamConfig, openStdin, stdinOnce, tty bool, stdin io
_, err = copyEscapable(cStdin, stdin)
} else {
_, err = io.Copy(cStdin, stdin)

}
if err == io.ErrClosedPipe {
err = nil
Expand Down
49 changes: 48 additions & 1 deletion daemon/container_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,54 @@ func mergeDevices(defaultDevices, userDevices []*configs.Device) []*configs.Devi
return append(devs, userDevices...)
}

// GetSize returns the real size & virtual size of the container.
// populateCommandRestore builds the execdriver.Command used to restore
// container c, filling in resources, capabilities, labels, and the
// process configuration (environment env).
//
// XXX populateCommand() does a lot more (network, ipc, pid, device
// settings are left unset here). Not sure if we have to do everything
// it does.
func populateCommandRestore(c *Container, env []string) error {
	processConfig := execdriver.ProcessConfig{
		Privileged: c.hostConfig.Privileged,
		Entrypoint: c.Path,
		Arguments:  c.Args,
		Tty:        c.Config.Tty,
		User:       c.Config.User,
	}
	// These fields are set after construction rather than in the literal
	// above, matching how populateCommand() assigns them.
	processConfig.Env = env
	processConfig.SysProcAttr = &syscall.SysProcAttr{Setsid: true}

	c.command = &execdriver.Command{
		ID:             c.ID,
		Rootfs:         c.RootfsPath(),
		ReadonlyRootfs: c.hostConfig.ReadonlyRootfs,
		InitPath:       "/.dockerinit",
		WorkingDir:     c.Config.WorkingDir,
		// Network, Ipc, Pid, AllowedDevices, AutoCreatedDevices, and
		// LxcConfig are intentionally left unset for restore (see XXX
		// above).
		Resources: &execdriver.Resources{
			Memory:     c.Config.Memory,
			MemorySwap: c.Config.MemorySwap,
			CpuShares:  c.Config.CpuShares,
			Cpuset:     c.Config.Cpuset,
		},
		CapAdd:          c.hostConfig.CapAdd,
		CapDrop:         c.hostConfig.CapDrop,
		ProcessConfig:   processConfig,
		ProcessLabel:    c.GetProcessLabel(),
		MountLabel:      c.GetMountLabel(),
		AppArmorProfile: c.AppArmorProfile,
	}

	return nil
}

// GetSize, return real size, virtual size
func (container *Container) getSize() (int64, int64) {
var (
sizeRw, sizeRootfs int64
Expand Down
31 changes: 31 additions & 0 deletions daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,18 @@ func (daemon *Daemon) restore() error {
logrus.Debugf("Loaded container %v", container.ID)

containers[container.ID] = &cr{container: container}

// If the container was checkpointed, we need to reserve
// the IP address that it was using.
//
// XXX We should also reserve host ports (if any).
if container.IsCheckpointed() {
/*err = bridge.ReserveIP(container.ID, container.NetworkSettings.IPAddress)
if err != nil {
log.Errorf("Failed to reserve IP %s for container %s",
container.ID, container.NetworkSettings.IPAddress)
}*/
}
} else {
logrus.Debugf("Cannot load container %s because it was created with another graph driver.", container.ID)
}
Expand Down Expand Up @@ -911,6 +923,25 @@ func (daemon *Daemon) run(c *Container, pipes *execdriver.Pipes, startCallback e
return daemon.execDriver.Run(c.command, pipes, hooks)
}

// Checkpoint checkpoints container c through the exec driver and, on
// success, marks the container state as checkpointed.
func (daemon *Daemon) Checkpoint(c *Container) error {
	err := daemon.execDriver.Checkpoint(c.command)
	if err == nil {
		c.SetCheckpointed()
	}
	return err
}

// Restore mounts the container's root filesystem and then asks the exec
// driver to restore the checkpointed process, returning its exit code.
func (daemon *Daemon) Restore(c *Container, pipes *execdriver.Pipes, restoreCallback execdriver.RestoreCallback) (int, error) {
	// Mount the container's filesystem (daemon/graphdriver/aufs/aufs.go).
	if _, err := daemon.driver.Get(c.ID, c.GetMountLabel()); err != nil {
		return 0, err
	}

	return daemon.execDriver.Restore(c.command, pipes, restoreCallback)
}

// kill forwards signal sig to container c's process via the exec driver.
func (daemon *Daemon) kill(c *Container, sig int) error {
	return daemon.execDriver.Kill(c.command, sig)
}
Expand Down
70 changes: 70 additions & 0 deletions daemon/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ type containerMonitor struct {
// left waiting for nothing to happen during this time
stopChan chan struct{}

// like startSignal but for restoring a container
restoreSignal chan struct{}

// timeIncrement is the amount of time to wait between restarts
// this is in milliseconds
timeIncrement int
Expand All @@ -64,6 +67,7 @@ func newContainerMonitor(container *Container, policy runconfig.RestartPolicy) *
timeIncrement: defaultTimeIncrement,
stopChan: make(chan struct{}),
startSignal: make(chan struct{}),
restoreSignal: make(chan struct{}),
}
}

Expand Down Expand Up @@ -183,6 +187,49 @@ func (m *containerMonitor) Start() error {
}
}

// Like Start() but for restoring a container: starts disk logging, runs
// the exec driver's restore through the daemon, and records the final
// exit status once the restored process exits.
func (m *containerMonitor) Restore() error {
	var (
		err error
		// XXX The following line should be changed to
		// exitStatus execdriver.ExitStatus to match Start()
		exitCode int
		afterRestore bool
	)

	// Record the stopped state only if the restore actually ran
	// (afterRestore); always close the monitor.
	//
	// NOTE(review): the ExitStatus composite literal below uses unkeyed
	// fields, which `go vet` flags — presumably {ExitCode, OomKilled};
	// confirm against execdriver.ExitStatus before keying it.
	defer func() {
		if afterRestore {
			m.container.Lock()
			m.container.setStopped(&execdriver.ExitStatus{exitCode, false})
			defer m.container.Unlock()
		}
		m.Close()
	}()

	// NOTE(review): this `err :=` shadows the outer err; harmless here
	// because the branch returns immediately.
	if err := m.container.startLoggingToDisk(); err != nil {
		m.resetContainer(false)
		return err
	}

	pipes := execdriver.NewPipes(m.container.stdin, m.container.stdout, m.container.stderr, m.container.Config.OpenStdin)

	m.container.LogEvent("restore")
	m.lastStartTime = time.Now()
	// daemon.Restore blocks until the restored process exits (or the
	// restore fails); restoreCallback unblocks waitForRestore() earlier.
	if exitCode, err = m.container.daemon.Restore(m.container, pipes, m.restoreCallback); err != nil {
		log.Errorf("Error restoring container: %s, exitCode=%d", err, exitCode)
		m.container.ExitCode = -1
		m.resetContainer(false)
		return err
	}
	afterRestore = true

	m.container.ExitCode = exitCode
	m.resetMonitor(err == nil && exitCode == 0)
	// The restored process has exited by this point, hence "die".
	m.container.LogEvent("die")
	m.resetContainer(true)
	return err
}

// resetMonitor resets the stateful fields on the containerMonitor based on the
// previous runs success or failure. Regardless of success, if the container had
// an execution time of more than 10s then reset the timer back to the default
Expand Down Expand Up @@ -278,6 +325,29 @@ func (m *containerMonitor) callback(processConfig *execdriver.ProcessConfig, pid
return nil
}

// Like callback() but for restoring a container: invoked by the exec
// driver with the restored process's pid (0 on failure). Marks the
// container running, unblocks waitForRestore(), and persists the
// container's configuration to disk.
func (m *containerMonitor) restoreCallback(processConfig *execdriver.ProcessConfig, restorePid int) {
	restored := restorePid != 0

	// A zero restorePid means the restore failed.
	if restored {
		m.container.setRunning(restorePid)
	}

	// Unblock the goroutine waiting in waitForRestore(); the select
	// makes the close idempotent.
	select {
	case <-m.restoreSignal:
	default:
		close(m.restoreSignal)
	}

	if restored {
		// Write config.json and hostconfig.json files
		// to /var/lib/docker/containers/<ID>.
		if err := m.container.ToDisk(); err != nil {
			log.Debugf("%s", err)
		}
	}
}

// resetContainer resets the container's IO and ensures that the command is able to be executed again
// by copying the data into a new struct
// if lock is true, then container locked during reset
Expand Down
Loading

0 comments on commit c19d362

Please sign in to comment.