Skip to content

Commit

Permalink
lib: improve conservation builder by adding checks, log messages and …
Browse files Browse the repository at this point in the history
…fixing Sonar issues, #TASK-5576, #TASK-5564
  • Loading branch information
jtarraga committed Jul 26, 2024
1 parent ddc1056 commit 8c6dc78
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -306,12 +306,20 @@ private AbstractBuilder buildProtein() throws CellBaseException {
}

private AbstractBuilder buildConservation() throws CellBaseException {
logger.info(BUILDING_LOG_MESSAGE, getDataName(CONSERVATION_DATA));

// Sanity check
Path conservationDownloadPath = downloadFolder.resolve(CONSERVATION_DATA);
Path conservationBuildPath = buildFolder.resolve(CONSERVATION_DATA);
copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(getDataVersionFilename(GERP_DATA)),
conservationDownloadPath.resolve(getDataVersionFilename(PHASTCONS_DATA)),
conservationDownloadPath.resolve(getDataVersionFilename(PHYLOP_DATA))), conservationBuildPath);

// Check and copy version files
List<String> dataList = Arrays.asList(GERP_DATA, PHASTCONS_DATA, PHYLOP_DATA);
for (String data : dataList) {
checkVersionFiles(Collections.singletonList(conservationDownloadPath.resolve(data).resolve(getDataVersionFilename(data))));
}
copyVersionFiles(Arrays.asList(conservationDownloadPath.resolve(GERP_DATA).resolve(getDataVersionFilename(GERP_DATA)),
conservationDownloadPath.resolve(PHASTCONS_DATA).resolve(getDataVersionFilename(PHASTCONS_DATA)),
conservationDownloadPath.resolve(PHYLOP_DATA).resolve(getDataVersionFilename(PHYLOP_DATA))), conservationBuildPath);

int conservationChunkSize = MongoDBCollectionConfiguration.CONSERVATION_CHUNK_SIZE;
CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(conservationBuildPath);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
package org.opencb.cellbase.lib;

import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.core.config.Configurator;
import org.opencb.cellbase.core.config.DownloadProperties;
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.lib.download.DownloadFile;
Expand Down Expand Up @@ -348,6 +346,8 @@ public final class EtlCommons {
private static Map<String, String> dataCategoriesMap = new HashMap<>();
private static Map<String, String> dataVersionFilenamesMap = new HashMap<>();

private static final Logger LOGGER = LoggerFactory.getLogger(EtlCommons.class);

static {

// Populate data names map
Expand Down Expand Up @@ -510,15 +510,9 @@ private EtlCommons() {
public static boolean runCommandLineProcess(File workingDirectory, String binPath, List<String> args, String logFilePath)
throws IOException, InterruptedException, CellBaseException {

Configurator.setRootLevel(Level.INFO);

Logger logger = LoggerFactory.getLogger("EtlCommons");

ProcessBuilder builder = getProcessBuilder(workingDirectory, binPath, args, logFilePath);

if (logger.isDebugEnabled()) {
logger.debug("Executing command: {}", StringUtils.join(builder.command(), " "));
}
LOGGER.debug("Executing command: {}", StringUtils.join(builder.command(), " "));
Process process = builder.start();
process.waitFor();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,26 +56,24 @@ public ConservationBuilder(Path conservedRegionPath, int chunkSize, CellBaseFile

@Override
public void parse() throws IOException, CellBaseException {
logger.info(BUILDING_LOG_MESSAGE, getDataName(CONSERVATION_DATA));

if (conservedRegionPath == null || !Files.exists(conservedRegionPath) || !Files.isDirectory(conservedRegionPath)) {
throw new IOException("Conservation directory " + conservedRegionPath + " does not exist or it is not a directory or it cannot"
+ " be read");
}

// Check GERP folder and files
Path gerpPath = conservedRegionPath.resolve(GERP_DATA);
DataSource dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(GERP_DATA)).toFile());
DataSource dataSource = dataSourceReader.readValue(gerpPath.resolve(getDataVersionFilename(GERP_DATA)).toFile());
List<File> gerpFiles = checkFiles(dataSource, gerpPath, getDataName(GERP_DATA));

// Check PhastCons folder and files
Path phastConsPath = conservedRegionPath.resolve(PHASTCONS_DATA);
dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(PHASTCONS_DATA)).toFile());
dataSource = dataSourceReader.readValue(phastConsPath.resolve(getDataVersionFilename(PHASTCONS_DATA)).toFile());
List<File> phastConsFiles = checkFiles(dataSource, phastConsPath, getDataName(PHASTCONS_DATA));

// Check PhyloP folder and files
Path phylopPath = conservedRegionPath.resolve(PHYLOP_DATA);
dataSource = dataSourceReader.readValue(conservedRegionPath.resolve(getDataVersionFilename(PHYLOP_DATA)).toFile());
dataSource = dataSourceReader.readValue(phylopPath.resolve(getDataVersionFilename(PHYLOP_DATA)).toFile());
List<File> phylopFiles = checkFiles(dataSource, phylopPath, getDataName(PHYLOP_DATA));

// GERP is downloaded from Ensembl as a bigwig file. The library we have doesn't seem to parse
Expand Down Expand Up @@ -137,8 +135,6 @@ public void parse() throws IOException, CellBaseException {
logger.debug("Processing chromosome '{}', file '{}'", chr, files.get(chr + PHYLOP_DATA));
processWigFixFile(files.get(chr + PHYLOP_DATA), PHYLOP_DATA);
}

logger.info(BUILDING_DONE_LOG_MESSAGE, getDataName(CONSERVATION_DATA));
}

private void gerpParser(Path gerpProcessFilePath) throws IOException, CellBaseException {
Expand Down Expand Up @@ -271,11 +267,10 @@ private void storeScores(int startOfBatch, String chromosome, List<Float> conser
conservationScores.clear();
}

private void processWigFixFile(Path inGzPath, String conservationSource) throws IOException {
private void processWigFixFile(Path inGzPath, String conservationSource) {
logger.info(PARSING_LOG_MESSAGE, inGzPath);
String line = null;
try (BufferedReader bufferedReader = FileUtils.newBufferedReader(inGzPath)) {

String line;
String chromosome = "";
int start = 0;
float value;
Expand Down Expand Up @@ -322,14 +317,21 @@ private void processWigFixFile(Path inGzPath, String conservationSource) throws
values.clear();
}

value = Float.parseFloat(line.trim());
try {
value = Float.parseFloat(line.trim());
} catch (NumberFormatException e) {
value = 0;
logger.warn("Invalid value: {}. Stack trace: {}", line, e.getStackTrace());
}
values.add(value);
}
}

// Write last
conservedRegion = new GenomicScoreRegion<>(chromosome, start, start + values.size() - 1, conservationSource, values);
fileSerializer.serialize(conservedRegion, getOutputFileName(chromosome));
} catch (Exception e) {
logger.error("ERROR parsing {}. Line: {}. Stack trace: {}", inGzPath, line, e.getStackTrace());
}
logger.info(PARSING_DONE_LOG_MESSAGE, inGzPath);
}
Expand All @@ -339,8 +341,11 @@ private String getOutputFileName(String chromosome) {
if (chromosome.equals("M")) {
chromosome = "MT";
}
String outputFileName = outputFileNames.get(chromosome);
if (outputFileName == null) {

String outputFileName;
if (outputFileNames.containsKey(chromosome)) {
outputFileName = outputFileNames.get(chromosome);
} else {
outputFileName = getFilename(CONSERVATION_DATA, chromosome);
outputFileNames.put(chromosome, outputFileName);
}
Expand Down

0 comments on commit 8c6dc78

Please sign in to comment.