리커버리 수정
This commit is contained in:
@@ -115,40 +115,60 @@ public class JobRecoveryOnStartupService {
|
|||||||
// - 서버만 재기동됐고 컨테이너는 그대로 살아있는 케이스
|
// - 서버만 재기동됐고 컨테이너는 그대로 살아있는 케이스
|
||||||
// - 실행중 docker 를 stop 하고 이어하기를 한다,
|
// - 실행중 docker 를 kill 하고 이어하기를 한다.
|
||||||
if (state.running()) {
|
if (state.running()) {
|
||||||
log.info("[RECOVERY] container still running. container={}", containerName);
|
log.warn("[RECOVERY] container still running. force kill. container={}", containerName);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
ProcessBuilder pb = new ProcessBuilder("docker", "stop", "-t", "20", containerName);
|
// ============================================================
|
||||||
|
// 1) docker kill (SIGKILL) 바로 전송
|
||||||
|
// - stop은 grace period가 있지만
|
||||||
|
// - kill은 즉시 종료
|
||||||
|
// ============================================================
|
||||||
|
ProcessBuilder pb = new ProcessBuilder("docker", "kill", containerName);
|
||||||
pb.redirectErrorStream(true);
|
pb.redirectErrorStream(true);
|
||||||
|
|
||||||
Process p = pb.start();
|
Process p = pb.start();
|
||||||
|
|
||||||
boolean finished = p.waitFor(30, TimeUnit.SECONDS);
|
boolean finished = p.waitFor(20, TimeUnit.SECONDS);
|
||||||
if (!finished) {
|
if (!finished) {
|
||||||
p.destroyForcibly();
|
p.destroyForcibly();
|
||||||
throw new IOException("docker stop timeout");
|
throw new IOException("docker kill timeout");
|
||||||
}
|
}
|
||||||
|
|
||||||
int code = p.exitValue();
|
int code = p.exitValue();
|
||||||
if (code != 0) {
|
if (code != 0) {
|
||||||
throw new IOException("docker stop failed. exit=" + code);
|
throw new IOException("docker kill failed. exit=" + code);
|
||||||
}
|
}
|
||||||
|
|
||||||
log.info(
|
// ============================================================
|
||||||
"[RECOVERY] container stopped (will be auto removed by --rm). container={}",
|
// 2) kill 후 실제로 죽었는지 확인
|
||||||
containerName);
|
// ============================================================
|
||||||
|
DockerInspectState after = inspectContainer(containerName);
|
||||||
|
if (after.exists() && after.running()) {
|
||||||
|
throw new IOException("docker kill returned 0 but container still running");
|
||||||
|
}
|
||||||
|
|
||||||
// job 상태 변경
|
log.info("[RECOVERY] container killed successfully. container={}", containerName);
|
||||||
modelTrainJobCoreService.markPaused(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
|
|
||||||
// model 상태 변경
|
// ============================================================
|
||||||
markStepStopByJobType(job, "AUTO_STOP_ON_SERVER_RESTART");
|
// 3) job 상태를 PAUSED로 변경 (서버 재기동으로 강제 중단)
|
||||||
|
// ============================================================
|
||||||
|
Integer modelId = job.getModelId() == null ? null : Math.toIntExact(job.getModelId());
|
||||||
|
|
||||||
|
modelTrainJobCoreService.markPaused(
|
||||||
|
job.getId(), modelId, "AUTO_KILLED_ON_SERVER_RESTART");
|
||||||
|
|
||||||
|
markStepStopByJobType(job, "AUTO_KILLED_ON_SERVER_RESTART");
|
||||||
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error("[RECOVERY] docker stop failed. container={}", containerName, e);
|
|
||||||
// job 상태 변경
|
log.error("[RECOVERY] docker kill failed. container={}", containerName, e);
|
||||||
modelTrainJobCoreService.markFailed(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
|
|
||||||
// model 상태 변경
|
modelTrainJobCoreService.markFailed(
|
||||||
markStepErrorByJobType(job, "AUTO_STOP_ON_SERVER_RESTART");
|
job.getId(), -1, "AUTO_KILL_FAILED_ON_SERVER_RESTART");
|
||||||
|
|
||||||
|
markStepErrorByJobType(job, "AUTO_KILL_FAILED_ON_SERVER_RESTART");
|
||||||
}
|
}
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user