From c5e03f7ca87b872eee13c2248f70f79c72a0208c Mon Sep 17 00:00:00 2001 From: teddy Date: Wed, 4 Mar 2026 00:21:51 +0900 Subject: [PATCH] =?UTF-8?q?=EB=A6=AC=EC=BB=A4=EB=B2=84=EB=A6=AC=20?= =?UTF-8?q?=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../service/JobRecoveryOnStartupService.java | 54 +++++++++++++------ 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java b/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java index cef50ab..9be8ec3 100644 --- a/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java +++ b/src/main/java/com/kamco/cd/training/train/service/JobRecoveryOnStartupService.java @@ -115,40 +115,60 @@ public class JobRecoveryOnStartupService { // - 서버만 재기동됐고 컨테이너는 그대로 살아있는 케이스 // - 실행중 docker 를 stop 하고 이어하기를 한다, if (state.running()) { - log.info("[RECOVERY] container still running. container={}", containerName); + log.warn("[RECOVERY] container still running. force kill. container={}", containerName); + try { - ProcessBuilder pb = new ProcessBuilder("docker", "stop", "-t", "20", containerName); + // ============================================================ + // 1) docker kill (SIGKILL) 바로 전송 + // - stop은 grace period가 있지만 + // - kill은 즉시 종료 + // ============================================================ + ProcessBuilder pb = new ProcessBuilder("docker", "kill", containerName); pb.redirectErrorStream(true); Process p = pb.start(); - boolean finished = p.waitFor(30, TimeUnit.SECONDS); + boolean finished = p.waitFor(20, TimeUnit.SECONDS); if (!finished) { p.destroyForcibly(); - throw new IOException("docker stop timeout"); + throw new IOException("docker kill timeout"); } int code = p.exitValue(); if (code != 0) { - throw new IOException("docker stop failed. exit=" + code); + throw new IOException("docker kill failed. exit=" + code); } - log.info( - "[RECOVERY] container stopped (will be auto removed by --rm). container={}", - containerName); + // ============================================================ + // 2) kill 후 실제로 죽었는지 확인 + // ============================================================ + DockerInspectState after = inspectContainer(containerName); + if (after.exists() && after.running()) { + throw new IOException("docker kill returned 0 but container still running"); + } - // job 상태 변경 - modelTrainJobCoreService.markPaused(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART"); - // model 상태 변경 - markStepStopByJobType(job, "AUTO_STOP_ON_SERVER_RESTART"); + log.info("[RECOVERY] container killed successfully. container={}", containerName); + + // ============================================================ + // 3) job 상태를 PAUSED로 변경 (서버 재기동으로 강제 중단) + // ============================================================ + Integer modelId = job.getModelId() == null ? null : Math.toIntExact(job.getModelId()); + + modelTrainJobCoreService.markPaused( + job.getId(), modelId, "AUTO_KILLED_ON_SERVER_RESTART"); + + markStepStopByJobType(job, "AUTO_KILLED_ON_SERVER_RESTART"); } catch (Exception e) { - log.error("[RECOVERY] docker stop failed. container={}", containerName, e); - // job 상태 변경 - modelTrainJobCoreService.markFailed(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART"); - // model 상태 변경 - markStepErrorByJobType(job, "AUTO_STOP_ON_SERVER_RESTART"); + + log.error("[RECOVERY] docker kill failed. container={}", containerName, e); + + modelTrainJobCoreService.markFailed( + job.getId(), -1, "AUTO_KILL_FAILED_ON_SERVER_RESTART"); + + markStepErrorByJobType(job, "AUTO_KILL_FAILED_ON_SERVER_RESTART"); } + continue; }