From 6583a45abda7454bc1077ae7ed99e733215c5a70 Mon Sep 17 00:00:00 2001 From: teddy Date: Wed, 4 Mar 2026 01:43:34 +0900 Subject: [PATCH] =?UTF-8?q?=ED=95=99=EC=8A=B5=20=EC=8B=A4=ED=8C=A8=20?= =?UTF-8?q?=EC=B2=98=EB=A6=AC=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../train/service/TrainJobWorker.java | 40 ++++++++++++++----- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java b/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java index f4256d7..985dabe 100644 --- a/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java +++ b/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java @@ -108,6 +108,10 @@ public class TrainJobWorker { return; } + /** + * 0 정상 종료 SUCCESS 1~125 학습 코드 에러 FAILED 137 OOMKill FAILED 143 SIGTERM (stop) STOP -1 우리 내부 + * 강제 중단 STOP + */ if (result.getExitCode() == 0) { // 성공 처리 modelTrainJobCoreService.markSuccess(jobId, result.getExitCode()); @@ -124,18 +128,34 @@ public class TrainJobWorker { } } else { - String failMsg = result.getStatus() + "\n" + result.getLogs(); - log.info("training fail Msg ={}", failMsg); - // 실패 처리 - modelTrainJobCoreService.markPaused( - jobId, result.getExitCode(), result.getStatus() + "\n" + result.getLogs()); - if (isEval) { - // 오류 정보 등록 - modelTrainMngCoreService.markStep2Stop(modelId, "exit=" + result.getExitCode()); + String failMsg = result.getStatus() + "\n" + result.getLogs(); + log.info("training fail exitCode={} Msg ={}", result.getExitCode(), failMsg); + + if (result.getExitCode() == -1 || result.getExitCode() == 143) { + // 실패 처리 + modelTrainJobCoreService.markPaused( + jobId, result.getExitCode(), result.getStatus() + "\n" + result.getLogs()); + + if (isEval) { + // 오류 정보 등록 + modelTrainMngCoreService.markStep2Stop(modelId, "exit=" + result.getExitCode()); + } else { + // 오류 정보 등록 + modelTrainMngCoreService.markStep1Stop(modelId, "exit=" + result.getExitCode()); + } } else { - // 오류 정보 등록 - modelTrainMngCoreService.markStep1Stop(modelId, "exit=" + result.getExitCode()); + // 실패 처리 + modelTrainJobCoreService.markFailed( + jobId, result.getExitCode(), result.getStatus() + "\n" + result.getLogs()); + + if (isEval) { + // 오류 정보 등록 + modelTrainMngCoreService.markStep2Error(modelId, "exit=" + result.getExitCode()); + } else { + // 오류 정보 등록 + modelTrainMngCoreService.markError(modelId, "exit=" + result.getExitCode()); + } } }