From 6572e17f00157109d7f2bb174d4767e681874a54 Mon Sep 17 00:00:00 2001 From: teddy Date: Thu, 12 Feb 2026 10:51:15 +0900 Subject: [PATCH 1/8] =?UTF-8?q?=EC=8B=A4=ED=96=89=20=EC=98=A4=EB=A5=98=20?= =?UTF-8?q?=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../training/postgres/core/ModelTestMetricsJobCoreService.java | 2 +- .../training/postgres/core/ModelTrainMetricsJobCoreService.java | 2 +- .../postgres/repository/model/ModelMngRepositoryImpl.java | 2 +- .../{schedule => train}/ModelTestMetricsJobRepository.java | 2 +- .../ModelTestMetricsJobRepositoryCustom.java | 2 +- .../{schedule => train}/ModelTestMetricsJobRepositoryImpl.java | 2 +- .../{schedule => train}/ModelTrainMetricsJobRepository.java | 2 +- .../ModelTrainMetricsJobRepositoryCustom.java | 2 +- .../{schedule => train}/ModelTrainMetricsJobRepositoryImpl.java | 2 +- .../{schedule => train}/service/ModelTestMetricsJobService.java | 2 +- .../service/ModelTrainMetricsJobService.java | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) rename src/main/java/com/kamco/cd/training/postgres/repository/{schedule => train}/ModelTestMetricsJobRepository.java (84%) rename src/main/java/com/kamco/cd/training/postgres/repository/{schedule => train}/ModelTestMetricsJobRepositoryCustom.java (81%) rename src/main/java/com/kamco/cd/training/postgres/repository/{schedule => train}/ModelTestMetricsJobRepositoryImpl.java (97%) rename src/main/java/com/kamco/cd/training/postgres/repository/{schedule => train}/ModelTrainMetricsJobRepository.java (85%) rename src/main/java/com/kamco/cd/training/postgres/repository/{schedule => train}/ModelTrainMetricsJobRepositoryCustom.java (84%) rename src/main/java/com/kamco/cd/training/postgres/repository/{schedule => train}/ModelTrainMetricsJobRepositoryImpl.java (97%) rename src/main/java/com/kamco/cd/training/{schedule => train}/service/ModelTestMetricsJobService.java (98%) rename src/main/java/com/kamco/cd/training/{schedule => train}/service/ModelTrainMetricsJobService.java (98%) diff --git a/src/main/java/com/kamco/cd/training/postgres/core/ModelTestMetricsJobCoreService.java b/src/main/java/com/kamco/cd/training/postgres/core/ModelTestMetricsJobCoreService.java index 4e7450f..6ba1e56 100644 --- a/src/main/java/com/kamco/cd/training/postgres/core/ModelTestMetricsJobCoreService.java +++ b/src/main/java/com/kamco/cd/training/postgres/core/ModelTestMetricsJobCoreService.java @@ -1,6 +1,6 @@ package com.kamco.cd.training.postgres.core; -import com.kamco.cd.training.postgres.repository.schedule.ModelTestMetricsJobRepository; +import com.kamco.cd.training.postgres.repository.train.ModelTestMetricsJobRepository; import java.util.List; import lombok.RequiredArgsConstructor; import org.springframework.stereotype.Service; diff --git a/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainMetricsJobCoreService.java b/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainMetricsJobCoreService.java index 3c6c1f2..5692017 100644 --- a/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainMetricsJobCoreService.java +++ b/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainMetricsJobCoreService.java @@ -1,6 +1,6 @@ package com.kamco.cd.training.postgres.core; -import com.kamco.cd.training.postgres.repository.schedule.ModelTrainMetricsJobRepository; +import com.kamco.cd.training.postgres.repository.train.ModelTrainMetricsJobRepository; import java.util.List; import lombok.RequiredArgsConstructor; import org.springframework.stereotype.Service; diff --git a/src/main/java/com/kamco/cd/training/postgres/repository/model/ModelMngRepositoryImpl.java b/src/main/java/com/kamco/cd/training/postgres/repository/model/ModelMngRepositoryImpl.java index 2fe22be..c83a4ef 100644 --- a/src/main/java/com/kamco/cd/training/postgres/repository/model/ModelMngRepositoryImpl.java +++ b/src/main/java/com/kamco/cd/training/postgres/repository/model/ModelMngRepositoryImpl.java @@ -103,7 +103,7 @@ public class ModelMngRepositoryImpl implements ModelMngRepositoryCustom { modelHyperParamEntity.gpuCnt, modelHyperParamEntity.learningRate, modelHyperParamEntity.backbone, - modelHyperParamEntity.epochCnt, + modelConfigEntity.epochCount, modelHyperParamEntity.trainNumWorkers, modelHyperParamEntity.valNumWorkers, modelHyperParamEntity.testNumWorkers, diff --git a/src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTestMetricsJobRepository.java b/src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTestMetricsJobRepository.java similarity index 84% rename from src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTestMetricsJobRepository.java rename to src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTestMetricsJobRepository.java index 48f8e67..d0945fa 100644 --- a/src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTestMetricsJobRepository.java +++ b/src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTestMetricsJobRepository.java @@ -1,4 +1,4 @@ -package com.kamco.cd.training.postgres.repository.schedule; +package com.kamco.cd.training.postgres.repository.train; import com.kamco.cd.training.postgres.entity.ModelMetricsTestEntity; import org.springframework.data.jpa.repository.JpaRepository; diff --git a/src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTestMetricsJobRepositoryCustom.java b/src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTestMetricsJobRepositoryCustom.java similarity index 81% rename from src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTestMetricsJobRepositoryCustom.java rename to src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTestMetricsJobRepositoryCustom.java index 5a34eca..bd993e1 100644 --- a/src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTestMetricsJobRepositoryCustom.java +++ b/src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTestMetricsJobRepositoryCustom.java @@ -1,4 +1,4 @@ -package com.kamco.cd.training.postgres.repository.schedule; +package com.kamco.cd.training.postgres.repository.train; import java.util.List; diff --git a/src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTestMetricsJobRepositoryImpl.java b/src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTestMetricsJobRepositoryImpl.java similarity index 97% rename from src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTestMetricsJobRepositoryImpl.java rename to src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTestMetricsJobRepositoryImpl.java index d30179f..7804c52 100644 --- a/src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTestMetricsJobRepositoryImpl.java +++ b/src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTestMetricsJobRepositoryImpl.java @@ -1,4 +1,4 @@ -package com.kamco.cd.training.postgres.repository.schedule; +package com.kamco.cd.training.postgres.repository.train; import static com.kamco.cd.training.postgres.entity.QModelMasterEntity.modelMasterEntity; diff --git a/src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTrainMetricsJobRepository.java b/src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTrainMetricsJobRepository.java similarity index 85% rename from src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTrainMetricsJobRepository.java rename to src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTrainMetricsJobRepository.java index 9397e15..2b58eab 100644 --- a/src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTrainMetricsJobRepository.java +++ b/src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTrainMetricsJobRepository.java @@ -1,4 +1,4 @@ -package com.kamco.cd.training.postgres.repository.schedule; +package com.kamco.cd.training.postgres.repository.train; import com.kamco.cd.training.postgres.entity.ModelMetricsTrainEntity; import org.springframework.data.jpa.repository.JpaRepository; diff --git a/src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTrainMetricsJobRepositoryCustom.java b/src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTrainMetricsJobRepositoryCustom.java similarity index 84% rename from src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTrainMetricsJobRepositoryCustom.java rename to src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTrainMetricsJobRepositoryCustom.java index 7a8c681..a10caa8 100644 --- a/src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTrainMetricsJobRepositoryCustom.java +++ b/src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTrainMetricsJobRepositoryCustom.java @@ -1,4 +1,4 @@ -package com.kamco.cd.training.postgres.repository.schedule; +package com.kamco.cd.training.postgres.repository.train; import java.util.List; diff --git a/src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTrainMetricsJobRepositoryImpl.java b/src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTrainMetricsJobRepositoryImpl.java similarity index 97% rename from src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTrainMetricsJobRepositoryImpl.java rename to src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTrainMetricsJobRepositoryImpl.java index 78bbda6..c20bc73 100644 --- a/src/main/java/com/kamco/cd/training/postgres/repository/schedule/ModelTrainMetricsJobRepositoryImpl.java +++ b/src/main/java/com/kamco/cd/training/postgres/repository/train/ModelTrainMetricsJobRepositoryImpl.java @@ -1,4 +1,4 @@ -package com.kamco.cd.training.postgres.repository.schedule; +package com.kamco.cd.training.postgres.repository.train; import static com.kamco.cd.training.postgres.entity.QModelMasterEntity.modelMasterEntity; diff --git a/src/main/java/com/kamco/cd/training/schedule/service/ModelTestMetricsJobService.java b/src/main/java/com/kamco/cd/training/train/service/ModelTestMetricsJobService.java similarity index 98% rename from src/main/java/com/kamco/cd/training/schedule/service/ModelTestMetricsJobService.java rename to src/main/java/com/kamco/cd/training/train/service/ModelTestMetricsJobService.java index 15e5011..c5936bc 100644 --- a/src/main/java/com/kamco/cd/training/schedule/service/ModelTestMetricsJobService.java +++ b/src/main/java/com/kamco/cd/training/train/service/ModelTestMetricsJobService.java @@ -1,4 +1,4 @@ -package com.kamco.cd.training.schedule.service; +package com.kamco.cd.training.train.service; import com.kamco.cd.training.postgres.core.ModelTestMetricsJobCoreService; import java.io.BufferedReader; diff --git a/src/main/java/com/kamco/cd/training/schedule/service/ModelTrainMetricsJobService.java b/src/main/java/com/kamco/cd/training/train/service/ModelTrainMetricsJobService.java similarity index 98% rename from src/main/java/com/kamco/cd/training/schedule/service/ModelTrainMetricsJobService.java rename to src/main/java/com/kamco/cd/training/train/service/ModelTrainMetricsJobService.java index ecb7c27..319a5fd 100644 --- a/src/main/java/com/kamco/cd/training/schedule/service/ModelTrainMetricsJobService.java +++ b/src/main/java/com/kamco/cd/training/train/service/ModelTrainMetricsJobService.java @@ -1,4 +1,4 @@ -package com.kamco.cd.training.schedule.service; +package com.kamco.cd.training.train.service; import com.kamco.cd.training.postgres.core.ModelTrainMetricsJobCoreService; import java.io.BufferedReader; From c6e721aa37dd6c6752e5171f3876d8cdcd614282 Mon Sep 17 00:00:00 2001 From: teddy Date: Thu, 12 Feb 2026 10:58:12 +0900 Subject: [PATCH 2/8] =?UTF-8?q?=EC=8B=A4=ED=96=89=20=EC=98=A4=EB=A5=98=20?= =?UTF-8?q?=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../train/service/DockerTrainService.java | 53 ++++++++++++++++--- 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java b/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java index 4949047..2d3325b 100644 --- a/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java +++ b/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java @@ -9,6 +9,8 @@ import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import lombok.extern.log4j.Log4j2; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; @@ -57,23 +59,58 @@ public class DockerTrainService { Process p = pb.start(); // 로그는 별도 스레드에서 읽기 (메인 스레드가 readLine에 안 걸리게) - StringBuilder log = new StringBuilder(); + StringBuilder logBuilder = new StringBuilder(); + + Pattern epochPattern = Pattern.compile("(?i)\\bepoch\\s*\\[?(\\d+)\\s*/\\s*(\\d+)\\]?\\b"); + Thread logThread = new Thread( () -> { try (BufferedReader br = new BufferedReader( new InputStreamReader(p.getInputStream(), StandardCharsets.UTF_8))) { + String line; while ((line = br.readLine()) != null) { - synchronized (log) { - log.append(line).append('\n'); + + // 1) 원래 하던 로그 누적 + synchronized (logBuilder) { + logBuilder.append(line).append('\n'); + } + + // 2) 🔥 epoch 감지 + DB 업데이트 + Matcher m = epochPattern.matcher(line); + if (m.find()) { + int currentEpoch = Integer.parseInt(m.group(1)); + int totalEpoch = Integer.parseInt(m.group(2)); + + log.info("[EPOCH] container={} {}/{}", containerName, currentEpoch, totalEpoch); + + // TODO 실행중인 에폭 저장 필요하면 만들어야함 + // modelTrainMngCoreService.updateCurrentEpoch(modelId, + // currentEpoch, totalEpoch); } } - } catch (Exception ignored) { + } catch (Exception e) { + log.warn("logThread error: {}", e.toString()); } }, "train-log-" + containerName); + // new Thread( + // () -> { + // try (BufferedReader br = + // new BufferedReader( + // new InputStreamReader(p.getInputStream(), StandardCharsets.UTF_8))) { + // String line; + // while ((line = br.readLine()) != null) { + // synchronized (log) { + // log.append(line).append('\n'); + // } + // } + // } catch (Exception ignored) { + // } + // }, + // "train-log-" + containerName); logThread.setDaemon(true); logThread.start(); @@ -90,8 +127,8 @@ public class DockerTrainService { killContainer(containerName); String logs; - synchronized (log) { - logs = log.toString(); + synchronized (logBuilder) { + logs = logBuilder.toString(); } return new TrainRunResult( @@ -108,8 +145,8 @@ public class DockerTrainService { logThread.join(500); String logs; - synchronized (log) { - logs = log.toString(); + synchronized (logBuilder) { + logs = logBuilder.toString(); } return new TrainRunResult(null, containerName, exit, exit == 0 ? "SUCCESS" : "FAILED", logs); From c56c0ca6052d9a9932da1ea2805f8ffd25591ff8 Mon Sep 17 00:00:00 2001 From: teddy Date: Thu, 12 Feb 2026 10:58:26 +0900 Subject: [PATCH 3/8] =?UTF-8?q?=EC=8B=A4=ED=96=89=20=EC=98=A4=EB=A5=98=20?= =?UTF-8?q?=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/kamco/cd/training/train/service/DockerTrainService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java b/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java index 2d3325b..9afd6ee 100644 --- a/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java +++ b/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java @@ -78,7 +78,7 @@ public class DockerTrainService { logBuilder.append(line).append('\n'); } - // 2) 🔥 epoch 감지 + DB 업데이트 + // 2) epoch 감지 + DB 업데이트 Matcher m = epochPattern.matcher(line); if (m.find()) { int currentEpoch = Integer.parseInt(m.group(1)); From c5f19cc961953ac777c895b5d34df13611c1f11d Mon Sep 17 00:00:00 2001 From: teddy Date: Thu, 12 Feb 2026 10:58:32 +0900 Subject: [PATCH 4/8] =?UTF-8?q?=EC=8B=A4=ED=96=89=20=EC=98=A4=EB=A5=98=20?= =?UTF-8?q?=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/kamco/cd/training/train/service/DockerTrainService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java b/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java index 9afd6ee..55398ea 100644 --- a/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java +++ b/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java @@ -73,7 +73,7 @@ public class DockerTrainService { String line; while ((line = br.readLine()) != null) { - // 1) 원래 하던 로그 누적 + // 1) 로그 누적 synchronized (logBuilder) { logBuilder.append(line).append('\n'); } From 190b93bee8c8e2b4d1c560a9aae63d1cbb14af72 Mon Sep 17 00:00:00 2001 From: teddy Date: Thu, 12 Feb 2026 10:58:51 +0900 Subject: [PATCH 5/8] =?UTF-8?q?=EC=8B=A4=ED=96=89=20=EC=98=A4=EB=A5=98=20?= =?UTF-8?q?=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/kamco/cd/training/train/service/DockerTrainService.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java b/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java index 55398ea..4c3dea4 100644 --- a/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java +++ b/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java @@ -87,6 +87,7 @@ public class DockerTrainService { log.info("[EPOCH] container={} {}/{}", containerName, currentEpoch, totalEpoch); // TODO 실행중인 에폭 저장 필요하면 만들어야함 + // TODO 완료여부를 여기다가? // modelTrainMngCoreService.updateCurrentEpoch(modelId, // currentEpoch, totalEpoch); } From fd7dfd7e7f6b84f27f3173009d4598c6a1cd111e Mon Sep 17 00:00:00 2001 From: teddy Date: Thu, 12 Feb 2026 11:10:28 +0900 Subject: [PATCH 6/8] =?UTF-8?q?containerName=20=EC=83=9D=EC=84=B1=20?= =?UTF-8?q?=EB=B3=80=EA=B2=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../repository/model/ModelMngRepositoryImpl.java | 3 ++- .../kamco/cd/training/train/dto/TrainRunRequest.java | 10 ++++++++-- .../cd/training/train/service/TestJobService.java | 2 +- .../cd/training/train/service/TrainJobService.java | 1 + .../cd/training/train/service/TrainJobWorker.java | 2 +- 5 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/kamco/cd/training/postgres/repository/model/ModelMngRepositoryImpl.java b/src/main/java/com/kamco/cd/training/postgres/repository/model/ModelMngRepositoryImpl.java index c83a4ef..547d837 100644 --- a/src/main/java/com/kamco/cd/training/postgres/repository/model/ModelMngRepositoryImpl.java +++ b/src/main/java/com/kamco/cd/training/postgres/repository/model/ModelMngRepositoryImpl.java @@ -135,7 +135,8 @@ public class ModelMngRepositoryImpl implements ModelMngRepositoryCustom { modelHyperParamEntity.saturationRange, modelHyperParamEntity.hueDelta, Expressions.nullExpression(Integer.class), - Expressions.nullExpression(String.class))) + Expressions.nullExpression(String.class), + modelHyperParamEntity.uuid)) .from(modelMasterEntity) .leftJoin(modelHyperParamEntity) .on(modelHyperParamEntity.id.eq(modelMasterEntity.hyperParamId)) diff --git a/src/main/java/com/kamco/cd/training/train/dto/TrainRunRequest.java b/src/main/java/com/kamco/cd/training/train/dto/TrainRunRequest.java index 1e1974c..e294ce7 100644 --- a/src/main/java/com/kamco/cd/training/train/dto/TrainRunRequest.java +++ b/src/main/java/com/kamco/cd/training/train/dto/TrainRunRequest.java @@ -82,11 +82,17 @@ public class TrainRunRequest { private Integer timeoutSeconds; private String resumeFrom; + private UUID uuid; + public String getDatasetFolder() { - return String.valueOf(datasetFolder); + return String.valueOf(this.datasetFolder); } public String getOutputFolder() { - return String.valueOf(outputFolder); + return String.valueOf(this.outputFolder); + } + + public String getUuid() { + return String.valueOf(this.uuid); } } diff --git a/src/main/java/com/kamco/cd/training/train/service/TestJobService.java b/src/main/java/com/kamco/cd/training/train/service/TestJobService.java index e90cf1c..e7b5dfa 100644 --- a/src/main/java/com/kamco/cd/training/train/service/TestJobService.java +++ b/src/main/java/com/kamco/cd/training/train/service/TestJobService.java @@ -31,7 +31,7 @@ public class TestJobService { Map params = new java.util.LinkedHashMap<>(); params.put("jobType", "EVAL"); - params.put("uuid", uuid); + params.put("uuid", String.valueOf(uuid)); params.put("epoch", epoch); int nextAttemptNo = modelTrainJobCoreService.findMaxAttemptNo(modelId) + 1; diff --git a/src/main/java/com/kamco/cd/training/train/service/TrainJobService.java b/src/main/java/com/kamco/cd/training/train/service/TrainJobService.java index a5816b5..189f0f9 100644 --- a/src/main/java/com/kamco/cd/training/train/service/TrainJobService.java +++ b/src/main/java/com/kamco/cd/training/train/service/TrainJobService.java @@ -57,6 +57,7 @@ public class TrainJobService { @SuppressWarnings("unchecked") Map paramsMap = objectMapper.convertValue(trainRunRequest, Map.class); paramsMap.put("jobType", "TRAIN"); + paramsMap.put("uuid", trainRunRequest.getUuid()); Long jobId = modelTrainJobCoreService.createQueuedJob( diff --git a/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java b/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java index 2e4a094..00f8cd5 100644 --- a/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java +++ b/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java @@ -47,7 +47,7 @@ public class TrainJobWorker { boolean isEval = "EVAL".equals(jobType); - String containerName = (isEval ? "eval-" : "train-") + jobId; + String containerName = (isEval ? "eval-" : "train-") + jobId + "-" + params.get("uuid"); modelTrainJobCoreService.markRunning(jobId, containerName, null, "TRAIN_WORKER"); From 96035f864a2f2648311455a63cd9dd102162c2da Mon Sep 17 00:00:00 2001 From: teddy Date: Thu, 12 Feb 2026 11:42:38 +0900 Subject: [PATCH 7/8] =?UTF-8?q?containerName=20=EC=83=9D=EC=84=B1=20?= =?UTF-8?q?=EB=B3=80=EA=B2=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../postgres/core/ModelTrainJobCoreService.java | 7 ++++++- .../postgres/entity/ModelTrainJobEntity.java | 10 +++++++++- .../cd/training/train/dto/ModelTrainJobDto.java | 2 ++ .../training/train/service/DockerTrainService.java | 2 +- .../cd/training/train/service/TrainJobService.java | 1 + .../cd/training/train/service/TrainJobWorker.java | 12 ++++++++++-- 6 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainJobCoreService.java b/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainJobCoreService.java index 350a248..4a2ce5d 100644 --- a/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainJobCoreService.java +++ b/src/main/java/com/kamco/cd/training/postgres/core/ModelTrainJobCoreService.java @@ -47,7 +47,8 @@ public class ModelTrainJobCoreService { /** 실행 시작 처리 */ @Transactional - public void markRunning(Long jobId, String containerName, String logPath, String lockedBy) { + public void markRunning( + Long jobId, String containerName, String logPath, String lockedBy, Integer totalEpoch) { ModelTrainJobEntity job = modelTrainJobRepository .findById(jobId) @@ -59,6 +60,10 @@ public class ModelTrainJobCoreService { job.setStartedDttm(ZonedDateTime.now()); job.setLockedDttm(ZonedDateTime.now()); job.setLockedBy(lockedBy); + + if (totalEpoch != null) { + job.setTotalEpoch(totalEpoch); + } } /** 성공 처리 */ diff --git a/src/main/java/com/kamco/cd/training/postgres/entity/ModelTrainJobEntity.java b/src/main/java/com/kamco/cd/training/postgres/entity/ModelTrainJobEntity.java index 23c11e0..4be89a8 100644 --- a/src/main/java/com/kamco/cd/training/postgres/entity/ModelTrainJobEntity.java +++ b/src/main/java/com/kamco/cd/training/postgres/entity/ModelTrainJobEntity.java @@ -78,6 +78,12 @@ public class ModelTrainJobEntity { @Column(name = "locked_by", length = 100) private String lockedBy; + @Column(name = "total_epoch") + private Integer totalEpoch; + + @Column(name = "current_epoch") + private Integer currentEpoch; + public ModelTrainJobDto toDto() { return new ModelTrainJobDto( this.id, @@ -90,6 +96,8 @@ public class ModelTrainJobEntity { this.paramsJson, this.queuedDttm, this.startedDttm, - this.finishedDttm); + this.finishedDttm, + this.totalEpoch, + this.currentEpoch); } } diff --git a/src/main/java/com/kamco/cd/training/train/dto/ModelTrainJobDto.java b/src/main/java/com/kamco/cd/training/train/dto/ModelTrainJobDto.java index f9d0004..9545ec4 100644 --- a/src/main/java/com/kamco/cd/training/train/dto/ModelTrainJobDto.java +++ b/src/main/java/com/kamco/cd/training/train/dto/ModelTrainJobDto.java @@ -20,4 +20,6 @@ public class ModelTrainJobDto { private ZonedDateTime queuedDttm; private ZonedDateTime startedDttm; private ZonedDateTime finishedDttm; + private Integer totalEpoch; + private Integer currentEpoch; } diff --git a/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java b/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java index 4c3dea4..b27e5a2 100644 --- a/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java +++ b/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java @@ -87,7 +87,7 @@ public class DockerTrainService { log.info("[EPOCH] container={} {}/{}", containerName, currentEpoch, totalEpoch); // TODO 실행중인 에폭 저장 필요하면 만들어야함 - // TODO 완료여부를 여기다가? + // TODO 하지만 여기서 트랜젝션 걸리는 db 작업하면 안좋다고하는데..? // modelTrainMngCoreService.updateCurrentEpoch(modelId, // currentEpoch, totalEpoch); } diff --git a/src/main/java/com/kamco/cd/training/train/service/TrainJobService.java b/src/main/java/com/kamco/cd/training/train/service/TrainJobService.java index 189f0f9..ab53f79 100644 --- a/src/main/java/com/kamco/cd/training/train/service/TrainJobService.java +++ b/src/main/java/com/kamco/cd/training/train/service/TrainJobService.java @@ -58,6 +58,7 @@ public class TrainJobService { Map paramsMap = objectMapper.convertValue(trainRunRequest, Map.class); paramsMap.put("jobType", "TRAIN"); paramsMap.put("uuid", trainRunRequest.getUuid()); + paramsMap.put("totalEpoch", trainRunRequest.getEpochs()); Long jobId = modelTrainJobCoreService.createQueuedJob( diff --git a/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java b/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java index 00f8cd5..afa2268 100644 --- a/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java +++ b/src/main/java/com/kamco/cd/training/train/service/TrainJobWorker.java @@ -47,9 +47,17 @@ public class TrainJobWorker { boolean isEval = "EVAL".equals(jobType); - String containerName = (isEval ? "eval-" : "train-") + jobId + "-" + params.get("uuid"); + String containerName = + (isEval ? "eval-" : "train-") + jobId + "-" + params.get("uuid").toString().substring(0, 8); - modelTrainJobCoreService.markRunning(jobId, containerName, null, "TRAIN_WORKER"); + Integer totalEpoch = null; + if (params.containsKey("totalEpoch")) { + if (params.get("totalEpoch") != null) { + totalEpoch = Integer.parseInt(params.get("totalEpoch").toString()); + } + } + + modelTrainJobCoreService.markRunning(jobId, containerName, null, "TRAIN_WORKER", totalEpoch); try { TrainRunResult result; From a83bd09f8f0ff5d9186c21f96a4bad84e0738cf7 Mon Sep 17 00:00:00 2001 From: teddy Date: Thu, 12 Feb 2026 12:05:30 +0900 Subject: [PATCH 8/8] =?UTF-8?q?containerName=20=EC=83=9D=EC=84=B1=20?= =?UTF-8?q?=EB=B3=80=EA=B2=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kamco/cd/training/train/service/DockerTrainService.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java b/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java index b27e5a2..be9486b 100644 --- a/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java +++ b/src/main/java/com/kamco/cd/training/train/service/DockerTrainService.java @@ -169,7 +169,7 @@ public class DockerTrainService { // 컨테이너 이름 지정 c.add("--name"); - c.add(containerName + "-" + req.getOutputFolder().substring(0, 8)); + c.add(containerName + "-" + req.getUuid().substring(0, 8)); // 실행 종료 시 자동 삭제 c.add("--rm"); @@ -221,7 +221,7 @@ public class DockerTrainService { c.add("/workspace/change-detection-code/train_wrapper.py"); // ===== 기본 파라미터 ===== - addArg(c, "--dataset-folder", req.getDatasetFolder()); + addArg(c, "--dataset-folder", "4BDBBDF99D04477A927CC9EBA760B845" /*req.getDatasetFolder()*/); addArg(c, "--output-folder", req.getOutputFolder()); addArg(c, "--input-size", req.getInputSize()); addArg(c, "--crop-size", req.getCropSize());