Merge pull request '학습 실패 처리 수정' (#155) from feat/training_260303 into develop

Reviewed-on: #155
This commit was merged in pull request #155.
This commit is contained in:
2026-03-04 01:43:56 +09:00

View File

@@ -108,6 +108,10 @@ public class TrainJobWorker {
return;
}
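/**
 * Exit code handling (exit code: meaning -> resulting job status):
 *   0     : normal termination                      -> SUCCESS
 *   1~125 : error raised by the training code       -> FAILED
 *   137   : OOMKill                                 -> FAILED
 *   143   : SIGTERM (stop)                          -> STOP
 *   -1    : forced stop issued by our own internals -> STOP
 */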
if (result.getExitCode() == 0) {
// 성공 처리
modelTrainJobCoreService.markSuccess(jobId, result.getExitCode());
@@ -124,18 +128,34 @@ public class TrainJobWorker {
}
} else {
String failMsg = result.getStatus() + "\n" + result.getLogs();
log.info("training fail Msg ={}", failMsg);
// 실패 처리
modelTrainJobCoreService.markPaused(
jobId, result.getExitCode(), result.getStatus() + "\n" + result.getLogs());
if (isEval) {
// 오류 정보 등록
modelTrainMngCoreService.markStep2Stop(modelId, "exit=" + result.getExitCode());
String failMsg = result.getStatus() + "\n" + result.getLogs();
log.info("training fail exitCode={} Msg ={}", result.getExitCode(), failMsg);
if (result.getExitCode() == -1 || result.getExitCode() == 143) {
// 실패 처리
modelTrainJobCoreService.markPaused(
jobId, result.getExitCode(), result.getStatus() + "\n" + result.getLogs());
if (isEval) {
// 오류 정보 등록
modelTrainMngCoreService.markStep2Stop(modelId, "exit=" + result.getExitCode());
} else {
// 오류 정보 등록
modelTrainMngCoreService.markStep1Stop(modelId, "exit=" + result.getExitCode());
}
} else {
// 오류 정보 등록
modelTrainMngCoreService.markStep1Stop(modelId, "exit=" + result.getExitCode());
// 실패 처리
modelTrainJobCoreService.markFailed(
jobId, result.getExitCode(), result.getStatus() + "\n" + result.getLogs());
if (isEval) {
// 오류 정보 등록
modelTrainMngCoreService.markStep2Error(modelId, "exit=" + result.getExitCode());
} else {
// 오류 정보 등록
modelTrainMngCoreService.markError(modelId, "exit=" + result.getExitCode());
}
}
}