@@ -167,14 +167,17 @@ type loaderState int
167167const (
 // Lifecycle states of the Loader. Values are assigned by iota in
 // declaration order, starting with created.
 //
 // NOTE(review): restoreFailed is inserted ahead of restored, which shifts
 // restored's numeric value by one. Confirm that nothing persists these
 // values or compares them numerically/by ordering.
168168 // created indicates that the Loader has been created, but not started yet.
169169 created loaderState = iota
170- // started indicates that the Loader has been started.
170+ // started indicates that the Loader has been started. This means that the
171+ // root container is running. Subsequent containers may still be unstarted.
171172 started
172173 // restoringUnstarted indicates that the Loader has been created and is
173174 // restoring containers, but not started yet.
174175 restoringUnstarted
175- // restoringStarted indicates that the Loader has been created and started,
176- // while restore continues in the background.
176+ // restoringStarted indicates that the Loader has been created and started
177+ // along with all containers, while restore continues in the background.
177178 restoringStarted
179+ // restoreFailed indicates that the Loader has failed to restore.
180+ restoreFailed
178181 // restored indicates that the Loader has been fully restored.
179182 restored
180183)
@@ -190,6 +193,8 @@ func (s loaderState) String() string {
190193 return "restoringUnstarted"
191194 case restoringStarted :
192195 return "restoringStarted"
196+ case restoreFailed :
197+ return "restoreFailed"
193198 case restored :
194199 return "restored"
195200 default :
@@ -283,6 +288,11 @@ type Loader struct {
283288 // during restore.
284289 saveRestoreNet bool
285290
291+ // restoreErr is the error that occurred during restore.
292+ //
293+ // +checklocks:mu
294+ restoreErr error
295+
286296 LoaderExtra
287297}
288298
@@ -1007,8 +1017,8 @@ func (l *Loader) run() error {
10071017 return fmt .Errorf ("trying to start deleted container %q" , l .sandboxID )
10081018 }
10091019
1010- // If we are restoring, we do not want to create a process.
1011- if l . state != restoringUnstarted {
1020+ switch l . state {
1021+ case created :
10121022 if l .root .conf .ProfileEnable {
10131023 pprof .Initialize ()
10141024 }
@@ -1049,6 +1059,10 @@ func (l *Loader) run() error {
10491059 return c .ContainerStart (context .Background (), fields , & evt )
10501060 })
10511061 }
1062+ case restoringUnstarted :
1063+ // If we are restoring, we do not want to create a process.
1064+ default :
1065+ return fmt .Errorf ("Loader.Run() called in unexpected state=%s" , l .state )
10521066 }
10531067
10541068 ep .tg = l .k .GlobalInit ()
@@ -1083,10 +1097,13 @@ func (l *Loader) run() error {
10831097 if err := l .k .Start (); err != nil {
10841098 return err
10851099 }
1086- if l .state == restoringUnstarted {
1087- l .state = restoringStarted
1088- } else {
1100+ switch l .state {
1101+ case created :
10891102 l .state = started
1103+ case restoringUnstarted :
1104+ l .state = restoringStarted
1105+ default :
1106+ panic (fmt .Sprintf ("state=%s in Loader.run() should be impossible" , l .state ))
10901107 }
10911108 return nil
10921109}
@@ -1478,29 +1495,43 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
14781495
14791496// waitContainer waits for the init process of a container to exit.
14801497func (l * Loader ) waitContainer (cid string , waitStatus * uint32 ) error {
1481- // Don't defer unlock, as doing so would make it impossible for
1482- // multiple clients to wait on the same container.
1483- key := execID {cid : cid }
1484- tg , err := l .threadGroupFromID (key )
1485- if err != nil {
1486- l .mu .Lock ()
1487- // Extra handling is needed if the restoring container has not started yet.
1488- if l .state != restoringUnstarted {
1489- l .mu .Unlock ()
1490- return err
1491- }
1492- // Container could be restoring, first check if container exists.
1493- if _ , err := l .findProcessLocked (key ); err != nil {
1494- l .mu .Unlock ()
1495- return err
1496- }
1498+ l .mu .Lock ()
1499+ state := l .state
1500+ if state == restoringUnstarted {
14971501 log .Infof ("Waiting for the container to restore, CID: %q" , cid )
14981502 l .restoreDone .Wait ()
14991503 l .mu .Unlock ()
1500-
15011504 log .Infof ("Restore is completed, trying to wait for container %q again." , cid )
15021505 return l .waitContainer (cid , waitStatus )
15031506 }
1507+ tg , err := l .tryThreadGroupFromIDLocked (execID {cid : cid })
1508+ l .mu .Unlock ()
1509+ if err != nil {
1510+ // The container does not exist.
1511+ return err
1512+ }
1513+ if tg == nil {
1514+ // The container has not been started.
1515+ switch state {
1516+ case created , started :
1517+ // Note that state=started means the root container has been started,
1518+ // but other containers may not have started yet.
1519+ return fmt .Errorf ("container %q not started" , cid )
1520+ case restoringStarted , restored :
1521+ // The container has restored, we *should* have found the init process...
1522+ return fmt .Errorf ("could not find init process of restored container %q in state %q" , cid , state )
1523+ case restoreFailed :
1524+ // If restore failed, we should return a non-zero exit status here to
1525+ // indicate that the container failed and transition to "stopped" state.
1526+ log .Warningf ("Restore failed, returning from waitContainer with non-zero exit status" )
1527+ * waitStatus = 1
1528+ return nil
1529+ case restoringUnstarted :
1530+ panic ("impossible" )
1531+ default :
1532+ panic (fmt .Sprintf ("Invalid state: %s" , state ))
1533+ }
1534+ }
15041535
15051536 // If the thread either has already exited or exits during waiting,
15061537 // consider the container exited.
@@ -1520,15 +1551,15 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
// waitRestore reports the outcome of a restore, blocking first if the restore
// is still in progress. The description below covers the "+" side of this hunk:
//
//   - If the state is restored or restoreFailed, it returns l.restoreErr
//     immediately (presumably nil when the restore succeeded; confirm at the
//     site that sets restoreErr).
//   - If the state is neither restoringUnstarted nor restoringStarted, it
//     returns an error saying the sandbox is not being restored.
//   - Otherwise it waits on l.restoreDone and then returns l.restoreErr.
//
// l.mu is locked on entry and released by the deferred Unlock on return.
15201551func (l * Loader ) waitRestore () error {
15211552 l .mu .Lock ()
15221553 defer l .mu .Unlock ()
1523- if l .state == restored {
1524- return nil
1554+ if l .state == restored || l . state == restoreFailed {
1555+ return l . restoreErr
15251556 }
15261557 if l .state != restoringUnstarted && l .state != restoringStarted {
15271558 return fmt .Errorf ("sandbox is not being restored, cannot wait for restore: state=%s" , l .state )
15281559 }
15291560 log .Infof ("Waiting for the sandbox to restore" )
 // NOTE(review): restoreDone is presumably a sync.Cond bound to l.mu, so Wait
 // releases the lock while blocked. The state is not re-checked after wake-up;
 // confirm that a wake-up only happens once restore has finished or failed.
15301561 l .restoreDone .Wait ()
1531- return nil
1562+ return l . restoreErr
15321563}
15331564
15341565func (l * Loader ) waitPID (tgid kernel.ThreadID , cid string , waitStatus * uint32 ) error {
@@ -2097,7 +2128,10 @@ func (l *Loader) containerRuntimeState(cid string) ContainerRuntimeState {
20972128 return RuntimeStateStopped
20982129 }
20992130 if exec .tg == nil {
2100- // Container has no thread group assigned, so it has started yet.
2131+ if l .state == restoreFailed {
2132+ return RuntimeStateStopped
2133+ }
2134+ // Container has no thread group assigned, so it has not started yet.
21012135 return RuntimeStateCreating
21022136 }
21032137 if exec .tg .Leader ().ExitState () == kernel .TaskExitNone {
0 commit comments