diff --git a/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainJobCoreService.java b/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainJobCoreService.java index a20cfc9..4bbc9d4 100644 --- a/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainJobCoreService.java +++ b/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainJobCoreService.java @@ -8,9 +8,11 @@ import java.util.Map; import java.util.Objects; import java.util.Optional; import lombok.RequiredArgsConstructor; +import lombok.extern.log4j.Log4j2; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; +@Log4j2 @Service @RequiredArgsConstructor @Transactional(readOnly = true) @@ -92,6 +94,8 @@ public class ModelTrainJobCoreService { job.setExitCode(exitCode); job.setErrorMessage(errorMessage); job.setFinishedDttm(ZonedDateTime.now()); + + log.info("[TRAIN JOB FAIL] jobId={}, errorMessage={}", jobId, errorMessage); } /** 취소 처리 */ diff --git a/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java b/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java index 31b32cc..dd19fe5 100644 --- a/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java +++ b/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java @@ -380,7 +380,6 @@ public class DockerTrainService { synchronized (log) { logs = log.toString(); } - return new TrainRunResult(null, containerName, -1, "TIMEOUT", logs); } diff --git a/src/main/java/com/kamco/cd/training/train/service/TrainJobService.java b/src/main/java/com/kamco/cd/training/train/service/TrainJobService.java index 505e9ed..d2599c1 100644 --- a/src/main/java/com/kamco/cd/training/train/service/TrainJobService.java +++ b/src/main/java/com/kamco/cd/training/train/service/TrainJobService.java @@ -17,12 +17,14 @@ import java.util.List; import java.util.Map; import java.util.UUID; import lombok.RequiredArgsConstructor; +import 
lombok.extern.log4j.Log4j2; import org.springframework.beans.factory.annotation.Value; import org.springframework.context.ApplicationEventPublisher; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; @Service +@Log4j2 @RequiredArgsConstructor @Transactional(readOnly = true) public class TrainJobService { @@ -237,7 +239,24 @@ public class TrainJobService { // 학습모델을 수정한다. modelTrainMngCoreService.updateModelMaster(modelId, updateReq); } catch (IOException e) { - throw new RuntimeException(e); + log.error( + "createTmpFile failed. modelUuid={}, modelId={}, tmpRaw={}, datasetIdsSize={}, uidsSize={}", + modelUuid, + modelId, + raw, + (datasetIds == null ? null : datasetIds.size()), + (uids == null ? null : uids.size()), + e); + + // Wrap in a runtime exception, keeping the key identifiers in the message + throw new IllegalStateException( + "tmp dataset build failed: modelUuid=" + + modelUuid + + ", modelId=" + + modelId + + ", tmpRaw=" + + raw, + e); } return modelUuid; } diff --git a/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java b/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java index 5e17a1a..e889f21 100644 --- a/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java +++ b/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java @@ -11,11 +13,13 @@ import com.kamco.cd.training.train.dto.TrainRunRequest; import com.kamco.cd.training.train.dto.TrainRunResult; import java.util.Map; import lombok.RequiredArgsConstructor; +import lombok.extern.log4j.Log4j2; import org.springframework.scheduling.annotation.Async; import org.springframework.stereotype.Component; import org.springframework.transaction.event.TransactionPhase; import org.springframework.transaction.event.TransactionalEventListener; +@Log4j2 @Component @RequiredArgsConstructor public class TrainJobWorker { @@ -118,7 +120,7 @@ public class TrainJobWorker { } } catch (Exception e) { - 
modelTrainJobCoreService.markFailed(jobId, null, e.toString()); + modelTrainJobCoreService.markFailed(jobId, null, e.getMessage() != null ? e.getMessage() : e.toString()); if ("EVAL".equals(params.get("jobType"))) { modelTrainMngCoreService.markStep2Error(modelId, e.getMessage());