From 8c6dc78ca199e1d37922c0b00f7d581f3d387bad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 26 Jul 2024 10:06:49 +0200 Subject: [PATCH] lib: improve conservation builder by adding checks, log messages and fixing sonnar issues, #TASK-5576, #TASK-5564 --- .../admin/executors/BuildCommandExecutor.java | 14 +++++++-- .../org/opencb/cellbase/lib/EtlCommons.java | 12 ++----- .../lib/builders/ConservationBuilder.java | 31 +++++++++++-------- 3 files changed, 32 insertions(+), 25 deletions(-) diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java index 4d324836b..22ec5971d 100644 --- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java +++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java @@ -306,12 +306,20 @@ private AbstractBuilder buildProtein() throws CellBaseException { } private AbstractBuilder buildConservation() throws CellBaseException { + logger.info(BUILDING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); + // Sanity check Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_DATA); Path conservationBuildPath = buildFolder.resolve(CONSERVATION_DATA); - copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(getDataVersionFilename(GERP_DATA)), - conservationDownloadPath.resolve(getDataVersionFilename(PHASTCONS_DATA)), - conservationDownloadPath.resolve(getDataVersionFilename(PHYLOP_DATA))), conservationBuildPath); + + // Check and copy version files + List dataList = Arrays.asList(GERP_DATA, PHASTCONS_DATA, PHYLOP_DATA); + for (String data : dataList) { + checkVersionFiles(Collections.singletonList(conservationDownloadPath.resolve(data).resolve(getDataVersionFilename(data)))); + } + copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(GERP_DATA).resolve(getDataVersionFilename(GERP_DATA)), + conservationDownloadPath.resolve(PHASTCONS_DATA).resolve(getDataVersionFilename(PHASTCONS_DATA)), + conservationDownloadPath.resolve(PHYLOP_DATA).resolve(getDataVersionFilename(PHYLOP_DATA))), conservationBuildPath); int conservationChunkSize = MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE; CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(conservationBuildPath); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 1ffebe30b..460987c6e 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -17,8 +17,6 @@ package org.opencb.cellbase.lib; import org.apache.commons.lang3.StringUtils; -import org.apache.logging.log4j.Level; -import org.apache.logging.log4j.core.config.Configurator; import org.opencb.cellbase.core.config.DownloadProperties; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.lib.download.DownloadFile; @@ -348,6 +346,8 @@ public final class EtlCommons { private static Map dataCategoriesMap = new HashMap<>(); private static Map dataVersionFilenamesMap = new HashMap<>(); + private static final Logger LOGGER = LoggerFactory.getLogger(EtlCommons.class); + static { // Populate data names map @@ -510,15 +510,9 @@ private EtlCommons() { public static boolean runCommandLineProcess(File workingDirectory, String binPath, List args, String logFilePath) throws IOException, InterruptedException, CellBaseException { - Configurator.setRootLevel(Level.INFO); - - Logger logger = LoggerFactory.getLogger("EtlCommons"); - ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFilePath); - if (logger.isDebugEnabled()) { - logger.debug("Executing command: {}", StringUtils.join(builder.command(), " ")); - } + LOGGER.debug("Executing command: {}", StringUtils.join(builder.command(), " ")); Process process = builder.start(); process.waitFor(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java index 9f2ae630f..aadcdb6ca 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/ConservationBuilder.java @@ -56,8 +56,6 @@ public ConservationBuilder(Path conservedRegionPath, int chunkSize, CellBaseFile @Override public void parse() throws IOException, CellBaseException { - logger.info(BUILDING_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); - if (conservedRegionPath == null || !Files.exists(conservedRegionPath) || !Files.isDirectory(conservedRegionPath)) { throw new IOException("Conservation directory " + conservedRegionPath + " does not exist or it is not a directory or it cannot" + " be read"); @@ -65,17 +63,17 @@ public void parse() throws IOException, CellBaseException { // Check GERP folder and files Path gerpPath = conservedRegionPath.resolve(GERP_DATA); - DataSource dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(GERP_DATA)).toFile()); + DataSource dataSource = dataSourceReader.readValue(gerpPath.resolve(getDataVersionFilename(GERP_DATA)).toFile()); List gerpFiles = checkFiles(dataSource, gerpPath, getDataName(GERP_DATA)); // Check PhastCons folder and files Path phastConsPath = conservedRegionPath.resolve(PHASTCONS_DATA); - dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(PHASTCONS_DATA)).toFile()); + dataSource = dataSourceReader.readValue(phastConsPath.resolve(getDataVersionFilename(PHASTCONS_DATA)).toFile()); List phastConsFiles = checkFiles(dataSource, phastConsPath, getDataName(PHASTCONS_DATA)); // Check PhyloP folder and files Path phylopPath = conservedRegionPath.resolve(PHYLOP_DATA); - dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(PHYLOP_DATA)).toFile()); + dataSource = dataSourceReader.readValue(phylopPath.resolve(getDataVersionFilename(PHYLOP_DATA)).toFile()); List phylopFiles = checkFiles(dataSource, phylopPath, getDataName(PHYLOP_DATA)); // GERP is downloaded from Ensembl as a bigwig file. The library we have doesn't seem to parse @@ -137,8 +135,6 @@ public void parse() throws IOException, CellBaseException { logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + PHYLOP_DATA)); processWigFixFile(files.get(chr + PHYLOP_DATA), PHYLOP_DATA); } - - logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(CONSERVATION_DATA)); } private void gerpParser(Path gerpProcessFilePath) throws IOException, CellBaseException { @@ -271,11 +267,10 @@ private void storeScores(int startOfBatch, String chromosome, List conser conservationScores.clear(); } - private void processWigFixFile(Path inGzPath, String conservationSource) throws IOException { + private void processWigFixFile(Path inGzPath, String conservationSource) { logger.info(PARSING_LOG_MESSAGE, inGzPath); + String line = null; try (BufferedReader bufferedReader = FileUtils.newBufferedReader(inGzPath)) { - - String line; String chromosome = ""; int start = 0; float value; @@ -322,7 +317,12 @@ private void processWigFixFile(Path inGzPath, String conservationSource) throws values.clear(); } - value = Float.parseFloat(line.trim()); + try { + value = Float.parseFloat(line.trim()); + } catch (NumberFormatException e) { + value = 0; + logger.warn("Invalid value: {}. Stack trace: {}", line, e.getStackTrace()); + } values.add(value); } } @@ -330,6 +330,8 @@ private void processWigFixFile(Path inGzPath, String conservationSource) throws // Write last conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, values); fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome)); + } catch (Exception e) { + logger.error("ERROR parsing {}. Line: {}. Stack trace: {}", inGzPath, line, e.getStackTrace()); } logger.info(PARSING_DONE_LOG_MESSAGE, inGzPath); } @@ -339,8 +341,11 @@ private String getOutputFileName(String chromosome) { if (chromosome.equals("M")) { chromosome = "MT"; } - String outputFileName = outputFileNames.get(chromosome); - if (outputFileName == null) { + + String outputFileName; + if (outputFileNames.containsKey(chromosome)) { + outputFileName = outputFileNames.get(chromosome); + } else { outputFileName = getFilename(CONSERVATION_DATA, chromosome); outputFileNames.put(chromosome, outputFileName); }