Merge pull request '리커버리 수정' (#147) from feat/training_260303 into develop
Reviewed-on: #147
This commit was merged in pull request #147.
This commit is contained in:
@@ -430,6 +430,44 @@ public class ModelTrainMngCoreService {
|
|||||||
master.setUpdatedDttm(ZonedDateTime.now());
|
master.setUpdatedDttm(ZonedDateTime.now());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* step1 정지 처리
|
||||||
|
*
|
||||||
|
* @param modelId
|
||||||
|
* @param errorMessage
|
||||||
|
*/
|
||||||
|
public void markStep1Stop(Long modelId, String errorMessage) {
|
||||||
|
ModelMasterEntity master =
|
||||||
|
modelMngRepository
|
||||||
|
.findById(modelId)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("Model not found: " + modelId));
|
||||||
|
|
||||||
|
master.setStatusCd(TrainStatusType.STOPPED.getId());
|
||||||
|
master.setStep2State(TrainStatusType.STOPPED.getId());
|
||||||
|
master.setLastError(errorMessage);
|
||||||
|
master.setUpdatedUid(userUtil.getId());
|
||||||
|
master.setUpdatedDttm(ZonedDateTime.now());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* step2 정지 처리
|
||||||
|
*
|
||||||
|
* @param modelId
|
||||||
|
* @param errorMessage
|
||||||
|
*/
|
||||||
|
public void markStep2Stop(Long modelId, String errorMessage) {
|
||||||
|
ModelMasterEntity master =
|
||||||
|
modelMngRepository
|
||||||
|
.findById(modelId)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("Model not found: " + modelId));
|
||||||
|
|
||||||
|
master.setStatusCd(TrainStatusType.STOPPED.getId());
|
||||||
|
master.setStep2State(TrainStatusType.STOPPED.getId());
|
||||||
|
master.setLastError(errorMessage);
|
||||||
|
master.setUpdatedUid(userUtil.getId());
|
||||||
|
master.setUpdatedDttm(ZonedDateTime.now());
|
||||||
|
}
|
||||||
|
|
||||||
@Transactional
|
@Transactional
|
||||||
public void markSuccess(Long modelId) {
|
public void markSuccess(Long modelId) {
|
||||||
ModelMasterEntity master =
|
ModelMasterEntity master =
|
||||||
|
|||||||
@@ -93,6 +93,7 @@ public class JobRecoveryOnStartupService {
|
|||||||
if (out.completed()) {
|
if (out.completed()) {
|
||||||
log.info("[RECOVERY] outputs look completed. mark SUCCESS. jobId={}", job.getId());
|
log.info("[RECOVERY] outputs look completed. mark SUCCESS. jobId={}", job.getId());
|
||||||
modelTrainJobCoreService.markSuccess(job.getId(), 0);
|
modelTrainJobCoreService.markSuccess(job.getId(), 0);
|
||||||
|
// model 상태 변경
|
||||||
markStepSuccessByJobType(job);
|
markStepSuccessByJobType(job);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
@@ -104,7 +105,7 @@ public class JobRecoveryOnStartupService {
|
|||||||
|
|
||||||
modelTrainJobCoreService.markFailed(
|
modelTrainJobCoreService.markFailed(
|
||||||
job.getId(), -1, "SERVER_RESTART_CONTAINER_MISSING_OUTPUT_INCOMPLETE");
|
job.getId(), -1, "SERVER_RESTART_CONTAINER_MISSING_OUTPUT_INCOMPLETE");
|
||||||
|
// model 상태 변경
|
||||||
markStepErrorByJobType(job, out.reason());
|
markStepErrorByJobType(job, out.reason());
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
@@ -136,13 +137,17 @@ public class JobRecoveryOnStartupService {
|
|||||||
"[RECOVERY] container stopped (will be auto removed by --rm). container={}",
|
"[RECOVERY] container stopped (will be auto removed by --rm). container={}",
|
||||||
containerName);
|
containerName);
|
||||||
|
|
||||||
// 여기서 상태를 PAUSED로 바꿔도 되고
|
// job 상태 변경
|
||||||
modelTrainJobCoreService.markPaused(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
|
modelTrainJobCoreService.markPaused(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
|
||||||
|
// model 상태 변경
|
||||||
|
markStepStopByJobType(job, "AUTO_STOP_ON_SERVER_RESTART");
|
||||||
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error("[RECOVERY] docker stop failed. container={}", containerName, e);
|
log.error("[RECOVERY] docker stop failed. container={}", containerName, e);
|
||||||
|
// job 상태 변경
|
||||||
modelTrainJobCoreService.markFailed(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
|
modelTrainJobCoreService.markFailed(job.getId(), -1, "AUTO_STOP_FAILED_ON_RESTART");
|
||||||
|
// model 상태 변경
|
||||||
|
markStepErrorByJobType(job, "AUTO_STOP_ON_SERVER_RESTART");
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -156,6 +161,7 @@ public class JobRecoveryOnStartupService {
|
|||||||
if (exitCode != null && exitCode == 0) {
|
if (exitCode != null && exitCode == 0) {
|
||||||
log.info("[RECOVERY] container exited(0). mark SUCCESS. container={}", containerName);
|
log.info("[RECOVERY] container exited(0). mark SUCCESS. container={}", containerName);
|
||||||
modelTrainJobCoreService.markSuccess(job.getId(), 0);
|
modelTrainJobCoreService.markSuccess(job.getId(), 0);
|
||||||
|
// model 상태 변경
|
||||||
markStepSuccessByJobType(job);
|
markStepSuccessByJobType(job);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
@@ -168,7 +174,7 @@ public class JobRecoveryOnStartupService {
|
|||||||
|
|
||||||
modelTrainJobCoreService.markFailed(
|
modelTrainJobCoreService.markFailed(
|
||||||
job.getId(), exitCode, "SERVER_RESTART_CONTAINER_EXIT_NONZERO");
|
job.getId(), exitCode, "SERVER_RESTART_CONTAINER_EXIT_NONZERO");
|
||||||
|
// model 상태 변경
|
||||||
markStepErrorByJobType(job, "exit=" + exitCode + " status=" + status);
|
markStepErrorByJobType(job, "exit=" + exitCode + " status=" + status);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -180,7 +186,7 @@ public class JobRecoveryOnStartupService {
|
|||||||
|
|
||||||
modelTrainJobCoreService.markFailed(
|
modelTrainJobCoreService.markFailed(
|
||||||
job.getId(), -1, "SERVER_RESTART_CONTAINER_INSPECT_ERROR");
|
job.getId(), -1, "SERVER_RESTART_CONTAINER_INSPECT_ERROR");
|
||||||
|
// model 상태 변경
|
||||||
markStepErrorByJobType(job, "inspect-error");
|
markStepErrorByJobType(job, "inspect-error");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -206,6 +212,16 @@ public class JobRecoveryOnStartupService {
|
|||||||
*
|
*
|
||||||
* <p>예: - jobType == "EVAL" → step2(평가 단계) 에러 - 그 외 → step1 혹은 전체 에러
|
* <p>예: - jobType == "EVAL" → step2(평가 단계) 에러 - 그 외 → step1 혹은 전체 에러
|
||||||
*/
|
*/
|
||||||
|
private void markStepStopByJobType(ModelTrainJobDto job, String msg) {
|
||||||
|
Map<String, Object> params = job.getParamsJson();
|
||||||
|
boolean isEval = params != null && "EVAL".equals(String.valueOf(params.get("jobType")));
|
||||||
|
if (isEval) {
|
||||||
|
modelTrainMngCoreService.markStep2Stop(job.getModelId(), msg);
|
||||||
|
} else {
|
||||||
|
modelTrainMngCoreService.markStep1Stop(job.getModelId(), msg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private void markStepErrorByJobType(ModelTrainJobDto job, String msg) {
|
private void markStepErrorByJobType(ModelTrainJobDto job, String msg) {
|
||||||
Map<String, Object> params = job.getParamsJson();
|
Map<String, Object> params = job.getParamsJson();
|
||||||
boolean isEval = params != null && "EVAL".equals(String.valueOf(params.get("jobType")));
|
boolean isEval = params != null && "EVAL".equals(String.valueOf(params.get("jobType")));
|
||||||
|
|||||||
Reference in New Issue
Block a user