select-dataset-list api solarPanelCnt 추가, spotless 적용

This commit is contained in:
2026-04-02 17:44:27 +09:00
parent 1df7142544
commit 71d9835b03
10 changed files with 153 additions and 19 deletions

View File

@@ -262,9 +262,9 @@ public class DockerTrainService {
c.add("-v");
c.add(basePath + ":" + basePath); // 심볼릭 링크와 연결되는 실제 파일 경로도 마운트를 해줘야 함
c.add("-v");
c.add(symbolicDir + ":/data"); //요청할경로
c.add(symbolicDir + ":/data"); // 요청할경로
c.add("-v");
c.add(responseDir + ":/checkpoints"); //저장될경로
c.add(responseDir + ":/checkpoints"); // 저장될경로
// 표준입력 유지 (-it 대신 -i만 사용)
c.add("-i");

View File

@@ -384,7 +384,20 @@ public class JobRecoveryOnStartupService {
return new OutputResult(false, "total-epoch-missing");
}
log.info("[RECOVERY] totalEpoch={}. jobId={}", totalEpoch, job.getId());
Integer valInterval = extractValInterval(job).orElse(null);
if (valInterval == null || valInterval <= 0) {
log.warn(
"[RECOVERY] valInterval missing or invalid. jobId={}, valInterval={}",
job.getId(),
valInterval);
return new OutputResult(false, "val-interval-missing");
}
log.info(
"[RECOVERY] totalEpoch={}. valInterval={}. jobId={}",
totalEpoch,
valInterval,
job.getId());
// 3) val.csv 존재 확인
Path valCsv = outDir.resolve("val.csv");
@@ -396,14 +409,17 @@ public class JobRecoveryOnStartupService {
// 4) val.csv 라인 수 확인
long lines = countNonHeaderLines(valCsv);
// expected = 실제 val 실행 횟수
int expectedLines = totalEpoch / valInterval;
log.info(
"[RECOVERY] val.csv lines counted. jobId={}, lines={}, expected={}",
job.getId(),
lines,
totalEpoch);
expectedLines);
// 5) 완료 판정
if (lines == totalEpoch) {
if (lines >= expectedLines) {
log.info("[RECOVERY] outputs look COMPLETE. jobId={}", job.getId());
return new OutputResult(true, "ok");
}
@@ -412,7 +428,7 @@ public class JobRecoveryOnStartupService {
"[RECOVERY] val.csv line mismatch. jobId={}, lines={}, expected={}",
job.getId(),
lines,
totalEpoch);
expectedLines);
// Report the expectation that was actually compared (expectedLines, i.e. the number of
// validation runs), matching the mismatch log above, rather than the raw totalEpoch.
return new OutputResult(
    false, "val.csv-lines-mismatch lines=" + lines + " expected=" + expectedLines);
@@ -530,4 +546,19 @@ public class JobRecoveryOnStartupService {
return reason;
}
}
/**
 * Extracts the validation interval from the job's {@code paramsJson} map.
 *
 * <p>Reads the {@code "valInterval"} entry. Accepted forms are any {@link Number} holding a whole
 * value within the {@code int} range (so {@code 5} and {@code 5.0} both yield 5) and any other
 * object whose string form, after trimming whitespace, parses as a decimal integer.
 *
 * @param job the training job whose parameters are inspected
 * @return the interval, or an empty Optional when the params map or the entry is missing, or
 *     when the value cannot be represented as an {@code int}
 */
private Optional<Integer> extractValInterval(ModelTrainJobDto job) {
  Map<String, Object> params = job.getParamsJson();
  if (params == null) {
    return Optional.empty();
  }

  Object raw = params.get("valInterval");
  if (raw == null) {
    return Optional.empty();
  }

  if (raw instanceof Number) {
    double numeric = ((Number) raw).doubleValue();
    // Only whole values inside the int range are usable; NaN, infinities and fractional
    // values fail this check and are treated as absent instead of being silently truncated.
    boolean wholeInt =
        numeric == Math.rint(numeric)
            && numeric >= Integer.MIN_VALUE
            && numeric <= Integer.MAX_VALUE;
    return wholeInt ? Optional.of((int) numeric) : Optional.empty();
  }

  try {
    return Optional.of(Integer.parseInt(String.valueOf(raw).trim()));
  } catch (NumberFormatException ignored) {
    // A non-numeric value is handled exactly like a missing one.
    return Optional.empty();
  }
}
}

View File

@@ -19,6 +19,7 @@ public class TmpDatasetService {
@Value("${train.docker.symbolic_link_dir}")
private String symbolicDir;
/**
* train, val, test 폴더별로 link
*

View File

@@ -132,7 +132,9 @@ public class TrainJobWorker {
String failMsg = result.getStatus() + "\n" + result.getLogs();
log.info("training fail exitCode={} Msg ={}", result.getExitCode(), failMsg);
if (result.getExitCode() == -1 || result.getExitCode() == 143) {
if (result.getExitCode() == -1
|| result.getExitCode() == 143
|| result.getExitCode() == 137) {
// 실패 처리
modelTrainJobCoreService.markPaused(
jobId, result.getExitCode(), result.getStatus() + "\n" + result.getLogs());