리커버리 수정 #148

Merged
teddy merged 1 commit from feat/training_260303 into develop 2026-03-04 00:22:09 +09:00
Showing only changes of commit c5e03f7ca8 - Show all commits

View File

@@ -115,40 +115,60 @@ public class JobRecoveryOnStartupService {
// - 서버만 재기동됐고 컨테이너는 그대로 살아있는 케이스 // - 서버만 재기동됐고 컨테이너는 그대로 살아있는 케이스
// - 실행중 docker 를 stop 하고 이어하기를 한다, // - 실행중 docker 를 stop 하고 이어하기를 한다,
if (state.running()) { if (state.running()) {
log.info("[RECOVERY] container still running. container={}", containerName); log.warn("[RECOVERY] container still running. force kill. container={}", containerName);
try { try {
ProcessBuilder pb = new ProcessBuilder("docker", "stop", "-t", "20", containerName); // ============================================================
// 1) docker kill (SIGKILL) 바로 전송
// - stop은 grace period가 있지만
// - kill은 즉시 종료
// ============================================================
ProcessBuilder pb = new ProcessBuilder("docker", "kill", containerName);
pb.redirectErrorStream(true); pb.redirectErrorStream(true);
Process p = pb.start(); Process p = pb.start();
boolean finished = p.waitFor(30, TimeUnit.SECONDS); boolean finished = p.waitFor(20, TimeUnit.SECONDS);
if (!finished) { if (!finished) {
p.destroyForcibly(); p.destroyForcibly();
throw new IOException("docker stop timeout"); throw new IOException("docker kill timeout");
} }
int code = p.exitValue(); int code = p.exitValue();
if (code != 0) { if (code != 0) {
throw new IOException("docker stop failed. exit=" + code); throw new IOException("docker kill failed. exit=" + code);
} }
log.info( // ============================================================
"[RECOVERY] container stopped (will be auto removed by --rm). container={}", // 2) kill 후 실제로 죽었는지 확인
containerName); // ============================================================
DockerInspectState after = inspectContainer(containerName);
if (after.exists() && after.running()) {
throw new IOException("docker kill returned 0 but container still running");
}
// job 상태 변경 log.info("[RECOVERY] container killed successfully. container={}", containerName);
modelTrainJobCoreService.markPaused(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
// model 상태 변경 // ============================================================
markStepStopByJobType(job, "AUTO_STOP_ON_SERVER_RESTART"); // 3) job 상태를 PAUSED로 변경 (서버 재기동으로 강제 중단)
// ============================================================
Integer modelId = job.getModelId() == null ? null : Math.toIntExact(job.getModelId());
modelTrainJobCoreService.markPaused(
job.getId(), modelId, "AUTO_KILLED_ON_SERVER_RESTART");
markStepStopByJobType(job, "AUTO_KILLED_ON_SERVER_RESTART");
} catch (Exception e) { } catch (Exception e) {
log.error("[RECOVERY] docker stop failed. container={}", containerName, e);
// job 상태 변경 log.error("[RECOVERY] docker kill failed. container={}", containerName, e);
modelTrainJobCoreService.markFailed(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
// model 상태 변경 modelTrainJobCoreService.markFailed(
markStepErrorByJobType(job, "AUTO_STOP_ON_SERVER_RESTART"); job.getId(), -1, "AUTO_KILL_FAILED_ON_SERVER_RESTART");
markStepErrorByJobType(job, "AUTO_KILL_FAILED_ON_SERVER_RESTART");
} }
continue; continue;
} }