Skip to content

Commit

Permalink
Updating of the ERGA EAR tool (#1492)
Browse files Browse the repository at this point in the history
* Updated make_EAR.py script

* Added methods section to the tool as per new script update

* Updated test data

* Bump version

* Updated test

* Updated tool version
  • Loading branch information
SaimMomin12 authored Aug 30, 2024
1 parent 7b9eed0 commit e293d14
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 95 deletions.
26 changes: 24 additions & 2 deletions tools/ear/macros.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<macros>
<token name="@TOOL_VERSION@">1.0.0</token>
<token name="@VERSION_SUFFIX@">1</token>
<token name="@TOOL_VERSION@">24.08.26</token>
<token name="@VERSION_SUFFIX@">0</token>
<token name="@PROFILE@">23.2</token>
<xml name="creator">
<creator>
Expand All @@ -21,4 +21,26 @@
</citation>
</citations>
</xml>
<xml name="methods_tests">
<!-- Reusable test fixture for the tool's METHODS section: pre-fills the
     "method_data" section with representative assembly- and curation-tool
     entries.  Each value follows the "Tool: version/param1/param2" format
     that make_EAR.py's pipeline parser expects (version first, then any
     key parameters, '/'-separated; a trailing '/' yields no parameter). -->
<section name="method_data">
<!-- Assembly pipeline entries, in run order -->
<repeat name="assembly_method_info">
<param name="assembly_tools_info" value="Hifiasm: 0.19.4/HiC/l0"/>
</repeat>
<repeat name="assembly_method_info">
<param name="assembly_tools_info" value="purge_dups: 1.2.6/"/>
</repeat>
<repeat name="assembly_method_info">
<param name="assembly_tools_info" value="Bionano_solve: Galaxy_3.7.0"/>
</repeat>
<repeat name="assembly_method_info">
<param name="assembly_tools_info" value="YaHS: 1.1"/>
</repeat>
<!-- Curation pipeline entries, in run order -->
<repeat name="curation_method_info">
<param name="curation_tools_info" value="GRIT_Rapid: 2.0"/>
</repeat>
<repeat name="curation_method_info">
<param name="curation_tools_info" value="HiGlass: 1.0"/>
</repeat>
</section>
</xml>
</macros>
165 changes: 79 additions & 86 deletions tools/ear/make_EAR.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@

import argparse
import glob
import logging
import math
import os
Expand All @@ -22,7 +21,7 @@
# CAUTION: This is for the Galaxy version!
# by Diego De Panis
# ERGA Sequencing and Assembly Committee
EAR_version = "v24.05.20_glxy_beta"
EAR_version = "v24.08.26"


def make_report(yaml_file):
Expand Down Expand Up @@ -120,19 +119,9 @@ def get_completeness_value(file_path, order, tool, haplotype):
fifth_column_value = target_line.split('\t')[4].strip()
return fifth_column_value
except Exception as e:
logging.warning(f"Error reading {file_path}: {str(e)}")
logging.error(f"Error reading {file_path} for tool {tool} and haplotype {haplotype}: {str(e)}")
return ''

# Getting kmer plots for curated asm
def get_png_files(dir_path):
    """Return exactly 4 k-mer spectra image paths (*.ln.png) from *dir_path*.

    ``glob.glob`` returns paths in arbitrary filesystem order, which made the
    plot placement in the report nondeterministic; the results are sorted so
    the same directory always yields the same layout.  If fewer than 4 images
    exist (e.g. a haploid assembly), a warning is logged and the missing
    slots are padded with ``None``.

    :param dir_path: directory to scan for ``*.ln.png`` files
    :return: list of exactly 4 entries, each a path string or ``None``
    """
    png_files = sorted(glob.glob(f"{dir_path}/*.ln.png"))
    if len(png_files) < 4:
        logging.warning(f"Warning: Less than 4 png files found in {dir_path}. If this is diploid, some images may be missing.")
    # fill missing slots with None so callers can always unpack 4 values
    while len(png_files) < 4:
        png_files.append(None)
    return png_files[:4]

# get unique part in file names
def find_unique_parts(file1, file2):
# Split filenames into parts
Expand All @@ -141,7 +130,6 @@ def find_unique_parts(file1, file2):
# Find unique parts
unique_parts1 = [part for part in parts1 if part not in parts2]
unique_parts2 = [part for part in parts2 if part not in parts1]

return ' '.join(unique_parts1), ' '.join(unique_parts2)

# extract BUSCO values
Expand Down Expand Up @@ -274,33 +262,34 @@ def generate_assembly_warnings(asm_data, gaps_per_gbp_data, obs_haploid_num):
# Parse pipeline and generate "tree"
def generate_pipeline_tree(pipeline_data):
tree_lines = []
indent = "&nbsp;" * 2 # Adjust indent spacing as needed

for tool_version_param in pipeline_data:
parts = tool_version_param.split('|')
tool_version = parts[0]
tool, version = tool_version.split('_v') if '_v' in tool_version else (tool_version, "NA")

# Handle parameters: join all but the first (which is tool_version) with ', '
param_text = ', '.join(parts[1:]) if len(parts) > 1 else "NA"

# Tool line
tool_line = f"- <b>{tool}</b>"
tree_lines.append(tool_line)

# Version line
version_line = f"{indent*2}|_ <i>ver:</i> {version}"
tree_lines.append(version_line)

# Param line(s)
if param_text != "NA":
for param in param_text.split(','):
param = param.strip()
param_line = f"{indent*2}|_ <i>key param:</i> {param if param else 'NA'}"
indent = "&nbsp;" * 2 # Adjust indent spacing

if isinstance(pipeline_data, dict):
for tool, version_param in pipeline_data.items():
# Tool line
tool_line = f"- <b>{tool}</b>"
tree_lines.append(tool_line)

# Convert version_param to string and split
version_param_str = str(version_param)
parts = version_param_str.split('/')
version = parts[0]
params = [p for p in parts[1:] if p] # This will remove empty strings

# Version line
version_line = f"{indent * 2}|_ <i>ver:</i> {version}"
tree_lines.append(version_line)

# Param line(s)
if params:
for param in params:
param_line = f"{indent * 2}|_ <i>key param:</i> {param}"
tree_lines.append(param_line)
else:
param_line = f"{indent * 2}|_ <i>key param:</i> NA"
tree_lines.append(param_line)
else:
param_line = f"{indent*2}|_ <i>key param:</i> NA"
tree_lines.append(param_line)
else:
tree_lines.append("Invalid pipeline data format")

# Join lines with HTML break for paragraph
tree_diagram = "<br/>".join(tree_lines)
Expand Down Expand Up @@ -330,10 +319,10 @@ def generate_pipeline_tree(pipeline_data):
tags = yaml_data["Tags"]

# Check if tag is valid
valid_tags = ["ERGA-BGE", "ERGA-Pilot", "ERGA-Satellite"]
valid_tags = ["ERGA-BGE", "ERGA-Pilot", "ERGA-Community", "ERGA-testing"]
if tags not in valid_tags:
tags += "[INVALID TAG]"
logging.warning("# SAMPLE INFORMATION section in the yaml file contains an invalid tag. Valid tags are ERGA-BGE, ERGA-Pilot and ERGA-Satellite")
logging.warning("# SAMPLE INFORMATION section in the yaml file contains an invalid tag. Valid tags are ERGA-BGE, ERGA-Pilot and ERGA-Community.")

# Get data from GoaT based on species name
# urllib.parse.quote to handle special characters and spaces in the species name
Expand Down Expand Up @@ -401,55 +390,62 @@ def generate_pipeline_tree(pipeline_data):
# Create a list of lists for the table
table_data = [headers, data_values]

# Extract pipeline data from 'Pre-curation' category
asm_pipeline_data = yaml_data.get('ASSEMBLIES', {}).get('Pre-curation', {}).get('pipeline', [])
asm_pipeline_tree = generate_pipeline_tree(asm_pipeline_data)
# Extract pipeline data
asm_pipeline_data = yaml_data.get('PIPELINES', {}).get('Assembly', {})
curation_pipeline_data = yaml_data.get('PIPELINES', {}).get('Curation', {})

# Extract pipeline data from 'Curated' category
curation_pipeline_data = yaml_data.get('ASSEMBLIES', {}).get('Curated', {}).get('pipeline', [])
asm_pipeline_tree = generate_pipeline_tree(asm_pipeline_data)
curation_pipeline_tree = generate_pipeline_tree(curation_pipeline_data)

# Reading GENOME PROFILING DATA section from yaml #############################################

profiling_data = yaml_data.get('PROFILING')

# Check if profiling_data is available
if not profiling_data:
logging.error('Error: No profiling data found in the YAML file.')
sys.exit(1)

# Handle GenomeScope specific processing
# Check for GenomeScope data (mandatory)
genomescope_data = profiling_data.get('GenomeScope')
if genomescope_data:
summary_file = genomescope_data.get('genomescope_summary_txt')
if summary_file and os.path.exists(summary_file):
with open(summary_file, "r") as f:
summary_txt = f.read()
genome_haploid_length = re.search(r"Genome Haploid Length\s+([\d,]+) bp", summary_txt).group(1)
proposed_ploidy_match = re.search(r"p = (\d+)", summary_txt)
proposed_ploidy = proposed_ploidy_match.group(1) if proposed_ploidy_match else 'NA'
else:
logging.error(f"File {summary_file} not found for GenomeScope.")
sys.exit(1)
else:
logging.error("GenomeScope data is missing in the PROFILING section.")
if not genomescope_data:
logging.error("Error: GenomeScope data is missing in the YAML file. This is mandatory.")
sys.exit(1)

# Handle Smudgeplot specific processing
genomescope_summary = genomescope_data.get('genomescope_summary_txt')
if not genomescope_summary:
logging.error("Error: GenomeScope summary file path is missing in the YAML file.")
sys.exit(1)

# Read the content of the GenomeScope summary file
try:
with open(genomescope_summary, "r") as f:
summary_txt = f.read()
# Extract values from summary.txt
genome_haploid_length = re.search(r"Genome Haploid Length\s+([\d,]+) bp", summary_txt).group(1)
proposed_ploidy = re.search(r"p = (\d+)", summary_txt).group(1)
except Exception as e:
logging.error(f"Error reading GenomeScope summary file: {str(e)}")
sys.exit(1)

# Check for Smudgeplot data (optional)
smudgeplot_data = profiling_data.get('Smudgeplot')
if smudgeplot_data:
verbose_summary_file = smudgeplot_data.get('smudgeplot_verbose_summary_txt')
if verbose_summary_file and os.path.exists(verbose_summary_file):
with open(verbose_summary_file, "r") as f:
smud_summary_txt = f.readlines()
for line in smud_summary_txt:
if line.startswith("* Proposed ploidy"):
proposed_ploidy = line.split(":")[1].strip()
break
smudgeplot_summary = smudgeplot_data.get('smudgeplot_verbose_summary_txt')
if smudgeplot_summary:
try:
with open(smudgeplot_summary, "r") as f:
smud_summary_txt = f.readlines()
for line in smud_summary_txt:
if line.startswith("* Proposed ploidy"):
proposed_ploidy = line.split(":")[1].strip()
break
except Exception as e:
logging.warning(f"Error reading Smudgeplot summary file: {str(e)}. Using GenomeScope ploidy.")
else:
logging.warning(f"Verbose summary file {verbose_summary_file} not found for Smudgeplot; skipping detailed Smudgeplot analysis.")
logging.warning("Smudgeplot summary file path is missing. Using GenomeScope ploidy.")
else:
logging.warning("Smudgeplot data is missing in the PROFILING section; skipping Smudgeplot analysis.")
logging.info("Smudgeplot data not provided. Using GenomeScope ploidy.")

# Reading ASSEMBLY DATA section from yaml #####################################################

Expand All @@ -459,7 +455,7 @@ def generate_pipeline_tree(pipeline_data):
asm_stages = []
for asm_stage, stage_properties in asm_data.items():
for haplotypes in stage_properties.keys():
if haplotypes != 'pipeline' and haplotypes not in asm_stages:
if haplotypes not in asm_stages:
asm_stages.append(haplotypes)

# get gfastats-based data
Expand All @@ -483,7 +479,7 @@ def generate_pipeline_tree(pipeline_data):
except (ValueError, ZeroDivisionError):
gaps_per_gbp_data[(asm_stage, haplotypes)] = ''

# Define the contigging table (column names) DON'T MOVE THIS AGAIN!!!!!!!
# Define the contigging table (column names)
asm_table_data = [["Metrics"] + [f'{asm_stage} \n {haplotypes}' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]]

# Fill the table with the gfastats data
Expand All @@ -493,16 +489,14 @@ def generate_pipeline_tree(pipeline_data):
asm_table_data.append([metric] + [format_number(gfastats_data.get((asm_stage, haplotypes), [''])[i]) if (asm_stage, haplotypes) in gfastats_data else '' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])

# Add the gaps/gbp in between
gc_index = display_names.index("GC %")
gc_index
asm_table_data.insert(gaps_index + 1, ['Gaps/Gbp'] + [format_number(gaps_per_gbp_data.get((asm_stage, haplotypes), '')) for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])

# get QV, Kmer completeness and BUSCO data
qv_data = {}
completeness_data = {}
busco_data = {metric: {} for metric in ['BUSCO sing.', 'BUSCO dupl.', 'BUSCO frag.', 'BUSCO miss.']}
for asm_stage, stage_properties in asm_data.items():
asm_stage_elements = [element for element in stage_properties.keys() if element != 'pipeline']
asm_stage_elements = list(stage_properties.keys())
for i, haplotypes in enumerate(asm_stage_elements):
haplotype_properties = stage_properties[haplotypes]
if isinstance(haplotype_properties, dict):
Expand Down Expand Up @@ -580,7 +574,7 @@ def generate_pipeline_tree(pipeline_data):
styles.add(ParagraphStyle(name='subTitleStyle', fontName='Courier', fontSize=16))
styles.add(ParagraphStyle(name='normalStyle', fontName='Courier', fontSize=12))
styles.add(ParagraphStyle(name='midiStyle', fontName='Courier', fontSize=10))
styles.add(ParagraphStyle(name='LinkStyle', fontName='Courier', fontSize=10, textColor='blue', underline=True))
# styles.add(ParagraphStyle(name='LinkStyle', fontName='Courier', fontSize=10, textColor='blue', underline=True))
styles.add(ParagraphStyle(name='treeStyle', fontName='Courier', fontSize=10, leftIndent=12))
styles.add(ParagraphStyle(name='miniStyle', fontName='Courier', fontSize=8))
styles.add(ParagraphStyle(name='FileNameStyle', fontName='Courier', fontSize=6))
Expand Down Expand Up @@ -659,7 +653,7 @@ def generate_pipeline_tree(pipeline_data):

# Iterate over haplotypes in the Curated category to get data for EBP metrics
curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
haplotype_names = [key for key in curated_assemblies.keys() if key != 'pipeline']
haplotype_names = list(curated_assemblies.keys())

for haplotype in haplotype_names:
properties = curated_assemblies[haplotype]
Expand Down Expand Up @@ -756,7 +750,7 @@ def generate_pipeline_tree(pipeline_data):
# Store BUSCO version and lineage information from each file in list
busco_info_list = []
for asm_stages, stage_properties in asm_data.items():
for haplotype_keys, haplotype_properties in stage_properties.items():
for i, haplotype_properties in stage_properties.items():
if isinstance(haplotype_properties, dict):
if 'busco_short_summary_txt' in haplotype_properties:
busco_version, lineage_info = extract_busco_info(haplotype_properties['busco_short_summary_txt'])
Expand Down Expand Up @@ -787,9 +781,9 @@ def generate_pipeline_tree(pipeline_data):
tool_count = 0

# Add title and images for each step
for idx, (asm_stages, stage_properties) in enumerate(asm_data.items(), 1):
for asm_stages, stage_properties in asm_data.items():
if asm_stages == 'Curated':
tool_elements = [element for element in stage_properties.keys() if element != 'pipeline']
tool_elements = list(stage_properties.keys())

images_with_names = []

Expand Down Expand Up @@ -825,7 +819,7 @@ def generate_pipeline_tree(pipeline_data):

# Add images and names to the elements in pairs
for i in range(0, len(images_with_names), 4): # Process two images (and their names) at a time
elements_to_add = images_with_names[i:i + 4]
elements_to_add = images_with_names[i: i + 4]

# Create table for the images and names
table = Table(elements_to_add)
Expand Down Expand Up @@ -856,7 +850,6 @@ def generate_pipeline_tree(pipeline_data):

# Iterate over haplotypes in the Curated category to get K-mer spectra images
curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
haplotype_names = [key for key in curated_assemblies.keys() if key != 'pipeline']

# Get paths for spectra files
spectra_files = {
Expand Down Expand Up @@ -974,9 +967,9 @@ def generate_pipeline_tree(pipeline_data):
tool_count = 0

# Add title and images for each step
for idx, (asm_stages, stage_properties) in enumerate(asm_data.items(), 1):
for asm_stages, stage_properties in asm_data.items():
if asm_stages == 'Curated': # Check if the current stage is 'Curated'
tool_elements = [element for element in stage_properties.keys() if element != 'pipeline']
tool_elements = list(stage_properties.keys())

for haplotype in tool_elements:
haplotype_properties = stage_properties[haplotype]
Expand Down
Loading

0 comments on commit e293d14

Please sign in to comment.