리커버리 수정
This commit is contained in:
@@ -115,40 +115,60 @@ public class JobRecoveryOnStartupService {
|
||||
// - 서버만 재기동됐고 컨테이너는 그대로 살아있는 케이스
|
||||
// - 실행 중인 docker 컨테이너를 중지한 뒤 이어하기를 한다.
|
||||
if (state.running()) {
|
||||
log.info("[RECOVERY] container still running. container={}", containerName);
|
||||
log.warn("[RECOVERY] container still running. force kill. container={}", containerName);
|
||||
|
||||
try {
|
||||
ProcessBuilder pb = new ProcessBuilder("docker", "stop", "-t", "20", containerName);
|
||||
// ============================================================
|
||||
// 1) docker kill (SIGKILL) 바로 전송
|
||||
// - stop은 grace period가 있지만
|
||||
// - kill은 즉시 종료
|
||||
// ============================================================
|
||||
ProcessBuilder pb = new ProcessBuilder("docker", "kill", containerName);
|
||||
pb.redirectErrorStream(true);
|
||||
|
||||
Process p = pb.start();
|
||||
|
||||
boolean finished = p.waitFor(30, TimeUnit.SECONDS);
|
||||
boolean finished = p.waitFor(20, TimeUnit.SECONDS);
|
||||
if (!finished) {
|
||||
p.destroyForcibly();
|
||||
throw new IOException("docker stop timeout");
|
||||
throw new IOException("docker kill timeout");
|
||||
}
|
||||
|
||||
int code = p.exitValue();
|
||||
if (code != 0) {
|
||||
throw new IOException("docker stop failed. exit=" + code);
|
||||
throw new IOException("docker kill failed. exit=" + code);
|
||||
}
|
||||
|
||||
log.info(
|
||||
"[RECOVERY] container stopped (will be auto removed by --rm). container={}",
|
||||
containerName);
|
||||
// ============================================================
|
||||
// 2) kill 후 실제로 죽었는지 확인
|
||||
// ============================================================
|
||||
DockerInspectState after = inspectContainer(containerName);
|
||||
if (after.exists() && after.running()) {
|
||||
throw new IOException("docker kill returned 0 but container still running");
|
||||
}
|
||||
|
||||
// job 상태 변경
|
||||
modelTrainJobCoreService.markPaused(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
|
||||
// model 상태 변경
|
||||
markStepStopByJobType(job, "AUTO_STOP_ON_SERVER_RESTART");
|
||||
log.info("[RECOVERY] container killed successfully. container={}", containerName);
|
||||
|
||||
// ============================================================
|
||||
// 3) job 상태를 PAUSED로 변경 (서버 재기동으로 강제 중단)
|
||||
// ============================================================
|
||||
Integer modelId = job.getModelId() == null ? null : Math.toIntExact(job.getModelId());
|
||||
|
||||
modelTrainJobCoreService.markPaused(
|
||||
job.getId(), modelId, "AUTO_KILLED_ON_SERVER_RESTART");
|
||||
|
||||
markStepStopByJobType(job, "AUTO_KILLED_ON_SERVER_RESTART");
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error("[RECOVERY] docker stop failed. container={}", containerName, e);
|
||||
// job 상태 변경
|
||||
modelTrainJobCoreService.markFailed(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
|
||||
// model 상태 변경
|
||||
markStepErrorByJobType(job, "AUTO_STOP_ON_SERVER_RESTART");
|
||||
|
||||
log.error("[RECOVERY] docker kill failed. container={}", containerName, e);
|
||||
|
||||
modelTrainJobCoreService.markFailed(
|
||||
job.getId(), -1, "AUTO_KILL_FAILED_ON_SERVER_RESTART");
|
||||
|
||||
markStepErrorByJobType(job, "AUTO_KILL_FAILED_ON_SERVER_RESTART");
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user