리커버리 수정 #148

Merged
teddy merged 1 commit from feat/training_260303 into develop 2026-03-04 00:22:09 +09:00

View File

@@ -115,40 +115,60 @@ public class JobRecoveryOnStartupService {
// - 서버만 재기동됐고 컨테이너는 그대로 살아있는 케이스
// - 실행 중인 docker 컨테이너를 stop 한 뒤 이어하기를 한다.
if (state.running()) {
log.info("[RECOVERY] container still running. container={}", containerName);
log.warn("[RECOVERY] container still running. force kill. container={}", containerName);
try {
ProcessBuilder pb = new ProcessBuilder("docker", "stop", "-t", "20", containerName);
// ============================================================
// 1) docker kill (SIGKILL) 바로 전송
// - stop은 grace period가 있지만
// - kill은 즉시 종료
// ============================================================
ProcessBuilder pb = new ProcessBuilder("docker", "kill", containerName);
pb.redirectErrorStream(true);
Process p = pb.start();
boolean finished = p.waitFor(30, TimeUnit.SECONDS);
boolean finished = p.waitFor(20, TimeUnit.SECONDS);
if (!finished) {
p.destroyForcibly();
throw new IOException("docker stop timeout");
throw new IOException("docker kill timeout");
}
int code = p.exitValue();
if (code != 0) {
throw new IOException("docker stop failed. exit=" + code);
throw new IOException("docker kill failed. exit=" + code);
}
log.info(
"[RECOVERY] container stopped (will be auto removed by --rm). container={}",
containerName);
// ============================================================
// 2) kill 후 실제로 죽었는지 확인
// ============================================================
DockerInspectState after = inspectContainer(containerName);
if (after.exists() && after.running()) {
throw new IOException("docker kill returned 0 but container still running");
}
// job 상태 변경
modelTrainJobCoreService.markPaused(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
// model 상태 변경
markStepStopByJobType(job, "AUTO_STOP_ON_SERVER_RESTART");
log.info("[RECOVERY] container killed successfully. container={}", containerName);
// ============================================================
// 3) job 상태를 PAUSED로 변경 (서버 재기동으로 강제 중단)
// ============================================================
Integer modelId = job.getModelId() == null ? null : Math.toIntExact(job.getModelId());
modelTrainJobCoreService.markPaused(
job.getId(), modelId, "AUTO_KILLED_ON_SERVER_RESTART");
markStepStopByJobType(job, "AUTO_KILLED_ON_SERVER_RESTART");
} catch (Exception e) {
log.error("[RECOVERY] docker stop failed. container={}", containerName, e);
// job 상태 변경
modelTrainJobCoreService.markFailed(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
// model 상태 변경
markStepErrorByJobType(job, "AUTO_STOP_ON_SERVER_RESTART");
log.error("[RECOVERY] docker kill failed. container={}", containerName, e);
modelTrainJobCoreService.markFailed(
job.getId(), -1, "AUTO_KILL_FAILED_ON_SERVER_RESTART");
markStepErrorByJobType(job, "AUTO_KILL_FAILED_ON_SERVER_RESTART");
}
continue;
}