From 2f8deeacb8674813b5c7144211abfbbb3f36e8b7 Mon Sep 17 00:00:00 2001 From: Mousumy Kundu Date: Fri, 15 Nov 2024 10:08:51 -0800 Subject: [PATCH 1/4] Update variant_caller parameter --- README.md | 8 +-- SigProfilerClusters/SigProfilerClusters.py | 2 +- SigProfilerClusters/SigProfilerHotSpots.py | 7 +- SigProfilerClusters/classifyFunctions.py | 82 ++++++---------------- 4 files changed, 30 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index b2717a1..63d5bc2 100755 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ See below for a detailed list of available parameters The following parameters are used if the subClassify argument is True: includedVAFs: [boolean] Parameter that informs the tool of the inclusion of VAFs in the dataset (default=True). includedCCFs: [boolean] Parameter that informs the tool of the inclusion of CCFs in the dataset (default=True). If CCFs are used, set includedVAFs=False. - variant_caller: [string] Parameter that informs the tool of what format the VAF scores are provided (default=None).Currently, there are four supported formats: sanger, TCGA, standardVC and mutect2. + variant_caller: [string] Parameter that informs the tool of what format the VAF scores are provided (default='standard'). windowSize: [integer] Window size for calculating mutation density in the rainfall plots. By default windowSize=10000000. correction [boolean] Optional parameter to perform a genome-wide mutational density correction (boolean; default=False). probability [boolean] Optional parameter to calculate the probability of observing each clustered event within the localized region of the genome. These values are saved into the [project_path]/output/clustered/ directories. See OSF wiki page for more details. @@ -89,11 +89,9 @@ SigProfilerClusters uses the VAF recorded in the input files to subclassify clus If you are not using VCFs as input files, VAFs cannot be used in the subclassification step. Therefore, to subclassify clusters using other input file types set subclassify=True and includedVAFs=False. -If your VAF is recorded in the 11th column of your VCF as the last number of the colon delimited values, set variant_caller="sanger". +If your VAF is recorded in the 11th column of your VCF as the last number of the colon delimited values, set variant_caller="caveman". -If your VAF is recorded in the 8th column of your VCF as VCF=xx, set variant_caller="TCGA". - -If your VAF is recorded in the 8th column of your VCF as AF=xx, set variant_caller="standardVC". +If your VAF is recorded in the 8th or 10th column of your VCF as VCF=xx or AF=xx, set variant_caller="standard". If your VAF is recorded in the 11th column of your VCF as AF=xx, set variant_caller="mutect2". diff --git a/SigProfilerClusters/SigProfilerClusters.py b/SigProfilerClusters/SigProfilerClusters.py index 8e2e27e..e1c35d4 100644 --- a/SigProfilerClusters/SigProfilerClusters.py +++ b/SigProfilerClusters/SigProfilerClusters.py @@ -638,7 +638,7 @@ def analysis( chrom_based=False, max_cpu=None, subClassify=False, - variant_caller=None, + variant_caller="standard", includedVAFs=True, includedCCFs=False, windowSize=1000000, diff --git a/SigProfilerClusters/SigProfilerHotSpots.py b/SigProfilerClusters/SigProfilerHotSpots.py index 54ea5c2..e86f5b5 100644 --- a/SigProfilerClusters/SigProfilerHotSpots.py +++ b/SigProfilerClusters/SigProfilerHotSpots.py @@ -477,7 +477,7 @@ def analysis( chrom_based=False, max_cpu=None, subClassify=False, - variant_caller=None, + variant_caller="standard", includedVAFs=True, windowSize=1000000, bedRanges=None, @@ -508,9 +508,8 @@ def analysis( max_cpu -> optional parameter to specify the number of maximum cpu's to use for parallelizing the code (integer; default=None: uses all available cpu's) subClassify -> optional parameter to subclassify the clustered mutations into refinded classes including DBSs, extended MBSs, kataegis, etc. (boolean; default=False) variant_caller -> optional parameter that informs the tool of what format the VAF scores are provided (boolean; default=None). This is required when subClassify=True. Currently, there are four supported formats: - -> sanger: If your VAF is recorded in the 11th column of your VCF as the last number of the colon delimited values, set variant_caller="sanger". - -> TCGA: If your VAF is recorded in the 8th column of your VCF as VCF=xx, set variant_caller="TCGA". - -> standardVC: If your VAF is recorded in the 10th column of your VCF as AF=xx, set variant_caller="standardVC". + -> caveman: If your VAF is recorded in the 11th column of your VCF as the last number of the colon delimited values, set variant_caller="caveman". + -> standard: If your VAF is recorded in the 8th or 10th column of your VCF as VCF=xx or AF=xx, set variant_caller="standard". -> mutect2: If your VAF is recorded in the 11th column of your VCF as AF=xx, set variant_caller="mutect2". includedVAFs -> optional parameter that informs the tool of the inclusion of VAFs in the dataset (boolean; default=True) windowSize -> the size of the window used for correcting the IMDs based upon mutational density within a given genomic range (integer; default=10000000) diff --git a/SigProfilerClusters/classifyFunctions.py b/SigProfilerClusters/classifyFunctions.py index 0393c77..1a202da 100644 --- a/SigProfilerClusters/classifyFunctions.py +++ b/SigProfilerClusters/classifyFunctions.py @@ -299,7 +299,7 @@ def pullCCF(project, project_path, correction=True): print("\t".join([x for x in lines]), file=out) -def pullVaf(project, project_path, variant_caller=None, correction=True): +def pullVaf(project, project_path, variant_caller="standard", correction=True): """ Collects the VAFs from the original mutation files. Assumes that these are provided in the same format as Sanger or TCGA. @@ -308,9 +308,8 @@ def pullVaf(project, project_path, variant_caller=None, correction=True): project -> user provided project name (string) project_path -> the directory for the given project (string) variant_caller -> optional parameter that informs the tool of what format the VAF scores are provided (boolean; default=None). This is required when subClassify=True. Currently, there are four supported formats: - -> sanger: If your VAF is recorded in the 11th column of your VCF as the last number of the colon delimited values, set variant_caller="sanger". - -> TCGA: If your VAF is recorded in the 8th column of your VCF as VCF=xx, set variant_caller="TCGA". - -> standardVC: If your VAF is recorded in the 10th column of your VCF as AF=xx, set variant_caller="standardVC". + -> caveman: If your VAF is recorded in the 11th column of your VCF as the last number of the colon delimited values, set variant_caller="caveman". + -> standard: If your VAF is recorded in the 8th or 10th column of your VCF as VCF=xx or AF=xx, set variant_caller="standard". -> mutect2: If your VAF is recorded in the 11th column of your VCF as AF=xx, set variant_caller="mutect2". correction -> optional parameter to perform a genome-wide mutational density correction (boolean; default=False) @@ -342,60 +341,12 @@ def pullVaf(project, project_path, variant_caller=None, correction=True): # Dictionary for variant caller mapping variant_type_dict = { - "sanger": "sanger", - "tcga": "TCGA", + "caveman": "caveman", "standardvc": "standardVC", "mutect2": "mutect2", } - # Check if variant_caller is provided - if variant_caller is None: - raise ValueError( - "Please specify your variant caller. Currently we are supporting four different variant callers: sanger, TCGA, standardVC and mutect2." - ) - - # Map variant_caller to standardized value using variant_type_dict - if variant_caller.lower() in variant_type_dict: - variant_caller = variant_type_dict[variant_caller.lower()] - else: - raise ValueError( - f"Unsupported variant caller: {variant_caller}. Currently we are supporting four different variant callers: sanger, TCGA, standardVC and mutect2" - ) - - if variant_caller == "sanger": - vafs = {} - for vcfFile in vcf_files: - sample = vcfFile.split(".")[0] - vafs[sample] = {} - with open(vcf_path + vcfFile) as f: - for lines in f: - if lines[0] == "#": - continue - lines = lines.strip().split() - chrom = lines[0] - if len(chrom) > 3: - chrom = chrom[3:] - pos = lines[1] - ref = lines[3] - alt = lines[4] - try: - vaf = float(lines[-1].split(":")[-1]) - except: - print( - "There does not seem to be VAF scores in this Sanger-produced file.\n\t", - vcfFile, - ) - break - if len(ref) == len(alt) and len(ref) > 1: - for i in range(len(ref)): - keyLine = ":".join( - [chrom, str(int(pos) + i), ref[i], alt[i]] - ) - vafs[sample][keyLine] = vaf - else: - keyLine = ":".join([chrom, pos, ref, alt]) - vafs[sample][keyLine] = vaf - - elif variant_caller == "TCGA": + ###extracting VAF info + if variant_caller == "standard": vafs = {} for vcfFile in vcf_files: sample = vcfFile.split(".")[0] @@ -410,9 +361,15 @@ def pullVaf(project, project_path, variant_caller=None, correction=True): ref = lines[3] alt = lines[4] try: - vaf = float(lines[7].split("VAF=")[1].split(";")[0]) + if "VAF=" in lines[7]: + vaf = float(lines[7].split("VAF=")[1].split(";")[0]) + elif "AF=" in lines[9]: + vaf = float(lines[9].split("AF=")[1].split(";")[0]) + else: + vaf = -1.5 except: vaf = -1.5 + if len(ref) == len(alt) and len(ref) > 1: for i in range(len(ref)): keyLine = ":".join( @@ -423,7 +380,7 @@ def pullVaf(project, project_path, variant_caller=None, correction=True): keyLine = ":".join([chrom, pos, ref, alt]) vafs[sample][keyLine] = vaf - elif variant_caller == "standardVC": + elif variant_caller == "caveman": vafs = {} for vcfFile in vcf_files: sample = vcfFile.split(".")[0] @@ -434,13 +391,19 @@ def pullVaf(project, project_path, variant_caller=None, correction=True): continue lines = lines.strip().split() chrom = lines[0] + if len(chrom) > 3: + chrom = chrom[3:] pos = lines[1] ref = lines[3] alt = lines[4] try: - vaf = float(lines[9].split("AF=")[1].split(";")[0]) + vaf = float(lines[-1].split(":")[-1]) except: - vaf = -1.5 + print( + "There does not seem to be VAF scores in this Sanger-produced file.\n\t", + vcfFile, + ) + break if len(ref) == len(alt) and len(ref) > 1: for i in range(len(ref)): keyLine = ":".join( @@ -450,6 +413,7 @@ def pullVaf(project, project_path, variant_caller=None, correction=True): else: keyLine = ":".join([chrom, pos, ref, alt]) vafs[sample][keyLine] = vaf + elif variant_caller == "mutect2": field = "AF" vcf_col = 11 From 780209c4bd7b62aa6063b16f9151abf3661b1b40 Mon Sep 17 00:00:00 2001 From: Mousumy Kundu Date: Tue, 26 Nov 2024 11:33:52 -0800 Subject: [PATCH 2/4] Fixed the typo --- README.md | 2 +- SigProfilerClusters/SigProfilerClusters.py | 2 +- SigProfilerClusters/SigProfilerHotSpots.py | 2 +- SigProfilerClusters/classifyFunctions.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 63d5bc2..ac3eb2f 100755 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ If you are not using VCFs as input files, VAFs cannot be used in the subclassifi If your VAF is recorded in the 11th column of your VCF as the last number of the colon delimited values, set variant_caller="caveman". -If your VAF is recorded in the 8th or 10th column of your VCF as VCF=xx or AF=xx, set variant_caller="standard". +If your VAF is recorded in the 8th or 10th column of your VCF as VAF=xx or AF=xx, set variant_caller="standard". If your VAF is recorded in the 11th column of your VCF as AF=xx, set variant_caller="mutect2". diff --git a/SigProfilerClusters/SigProfilerClusters.py b/SigProfilerClusters/SigProfilerClusters.py index e1c35d4..8c93cfa 100644 --- a/SigProfilerClusters/SigProfilerClusters.py +++ b/SigProfilerClusters/SigProfilerClusters.py @@ -672,7 +672,7 @@ def analysis( subClassify -> optional parameter to subclassify the clustered mutations into refinded classes including DBSs, extended MBSs, kataegis, etc. (boolean; default=False) variant_caller -> optional parameter that informs the tool of what format the VAF scores are provided (boolean; default=None). This is required when subClassify=True. Currently, there are four supported formats: -> sanger: If your VAF is recorded in the 11th column of your VCF as the last number of the colon delimited values, set variant_caller="sanger". - -> TCGA: If your VAF is recorded in the 8th column of your VCF as VCF=xx, set variant_caller="TCGA". + -> TCGA: If your VAF is recorded in the 8th column of your VCF as VAF=xx, set variant_caller="TCGA". -> standardVC: If your VAF is recorded in the 10th column of your VCF as AF=xx, set variant_caller="standardVC". -> mutect2: If your VAF is recorded in the 11th column of your VCF as AF=xx, set variant_caller="mutect2". includedVAFs -> optional parameter that informs the tool of the inclusion of VAFs in the dataset (boolean; default=True) diff --git a/SigProfilerClusters/SigProfilerHotSpots.py b/SigProfilerClusters/SigProfilerHotSpots.py index e86f5b5..370276d 100644 --- a/SigProfilerClusters/SigProfilerHotSpots.py +++ b/SigProfilerClusters/SigProfilerHotSpots.py @@ -509,7 +509,7 @@ def analysis( subClassify -> optional parameter to subclassify the clustered mutations into refinded classes including DBSs, extended MBSs, kataegis, etc. (boolean; default=False) variant_caller -> optional parameter that informs the tool of what format the VAF scores are provided (boolean; default=None). This is required when subClassify=True. Currently, there are four supported formats: -> caveman: If your VAF is recorded in the 11th column of your VCF as the last number of the colon delimited values, set variant_caller="caveman". - -> standard: If your VAF is recorded in the 8th or 10th column of your VCF as VCF=xx or AF=xx, set variant_caller="standard". + -> standard: If your VAF is recorded in the 8th or 10th column of your VCF as VAF=xx or AF=xx, set variant_caller="standard". -> mutect2: If your VAF is recorded in the 11th column of your VCF as AF=xx, set variant_caller="mutect2". includedVAFs -> optional parameter that informs the tool of the inclusion of VAFs in the dataset (boolean; default=True) windowSize -> the size of the window used for correcting the IMDs based upon mutational density within a given genomic range (integer; default=10000000) diff --git a/SigProfilerClusters/classifyFunctions.py b/SigProfilerClusters/classifyFunctions.py index 1a202da..08c7a45 100644 --- a/SigProfilerClusters/classifyFunctions.py +++ b/SigProfilerClusters/classifyFunctions.py @@ -309,7 +309,7 @@ def pullVaf(project, project_path, variant_caller="standard", correction=True): project_path -> the directory for the given project (string) variant_caller -> optional parameter that informs the tool of what format the VAF scores are provided (boolean; default=None). This is required when subClassify=True. Currently, there are four supported formats: -> caveman: If your VAF is recorded in the 11th column of your VCF as the last number of the colon delimited values, set variant_caller="caveman". - -> standard: If your VAF is recorded in the 8th or 10th column of your VCF as VCF=xx or AF=xx, set variant_caller="standard". + -> standard: If your VAF is recorded in the 8th or 10th column of your VCF as VAF=xx or AF=xx, set variant_caller="standard". -> mutect2: If your VAF is recorded in the 11th column of your VCF as AF=xx, set variant_caller="mutect2". correction -> optional parameter to perform a genome-wide mutational density correction (boolean; default=False) @@ -342,7 +342,7 @@ def pullVaf(project, project_path, variant_caller="standard", correction=True): # Dictionary for variant caller mapping variant_type_dict = { "caveman": "caveman", - "standardvc": "standardVC", + "standard": "standard", "mutect2": "mutect2", } ###extracting VAF info From f8f195293d7ed20fe4990f030148624e5f8a2e08 Mon Sep 17 00:00:00 2001 From: Mousumy Kundu Date: Tue, 17 Dec 2024 11:43:22 -0800 Subject: [PATCH 3/4] Fixed calling VAFs for mutect2 --- SigProfilerClusters/classifyFunctions.py | 123 +++++++++++++++++------ 1 file changed, 92 insertions(+), 31 deletions(-) diff --git a/SigProfilerClusters/classifyFunctions.py b/SigProfilerClusters/classifyFunctions.py index 08c7a45..5b0322d 100644 --- a/SigProfilerClusters/classifyFunctions.py +++ b/SigProfilerClusters/classifyFunctions.py @@ -414,46 +414,107 @@ def pullVaf(project, project_path, variant_caller="standard", correction=True): keyLine = ":".join([chrom, pos, ref, alt]) vafs[sample][keyLine] = vaf + # elif variant_caller == "mutect2": + # field = "AF" + # vcf_col = 11 + # vafs = {} + # for vcfFile in vcf_files: + # sample = vcfFile.split(".")[0] + # vafs[sample] = {} + # with open(os.path.join(vcf_path, vcfFile)) as f: + # for lines in f: + # if lines[0] == "#": + # continue + # lines = lines.strip().split() + # chrom = lines[0] + # if chrom.startswith("chr") or chrom.startswith("Chr"): + # chrom = chrom[3:] + # pos = lines[1] + # ref = lines[3] + # alt = lines[4] + # try: + # ## Column 9 is assumed to be FORMAT + # fmt = lines[8].split(":") + # ## Extract VAF index and use it to extract VAF from the provided column + # vaf_ind = fmt.index(field) + # ## Use vcf_col-1 because humans think lists as 1-indexed but python thinks 0-indexed + # vaf = float(lines[vcf_col - 1].split(":")[vaf_ind]) + # except: + # print( + # "Provided VAF field does not match any field in VCF.\n\t", + # vcfFile, + # ) + # break + # if len(ref) == len(alt) and len(ref) > 1: + # for i in range(len(ref)): + # keyLine = ":".join( + # [chrom, str(int(pos) + i), ref[i], alt[i]] + # ) + # vafs[sample][keyLine] = vaf + # else: + # keyLine = ":".join([chrom, pos, ref, alt]) + # vafs[sample][keyLine] = vaf elif variant_caller == "mutect2": - field = "AF" - vcf_col = 11 + field = "AF" # The VAF field in the FORMAT column vafs = {} + for vcfFile in vcf_files: sample = vcfFile.split(".")[0] vafs[sample] = {} + header = [] + with open(os.path.join(vcf_path, vcfFile)) as f: - for lines in f: - if lines[0] == "#": + for line in f: + # Identify the header line with column names + if line.startswith("#") and not line.startswith("##"): + header = line.strip().split("\t") + break # Stop after finding the header + + # Check if TUMOR column exists + if "TUMOR" not in header: + print(f"TUMOR column not found in {vcfFile}. Skipping...") + continue + + # Get the index of the TUMOR column + tumor_index = header.index("TUMOR") + + # Process the data rows + for line in f: + if line.startswith("#"): continue - lines = lines.strip().split() - chrom = lines[0] - if chrom.startswith("chr") or chrom.startswith("Chr"): - chrom = chrom[3:] - pos = lines[1] - ref = lines[3] - alt = lines[4] + try: - ## Column 9 is assumed to be FORMAT - fmt = lines[8].split(":") - ## Extract VAF index and use it to extract VAF from the provided column - vaf_ind = fmt.index(field) - ## Use vcf_col-1 because humans think lists as 1-indexed but python thinks 0-indexed - vaf = float(lines[vcf_col - 1].split(":")[vaf_ind]) - except: - print( - "Provided VAF field does not match any field in VCF.\n\t", - vcfFile, - ) - break - if len(ref) == len(alt) and len(ref) > 1: - for i in range(len(ref)): - keyLine = ":".join( - [chrom, str(int(pos) + i), ref[i], alt[i]] - ) + fields = line.strip().split("\t") + chrom = fields[0] + + # Normalize chromosome naming + if chrom.lower().startswith("chr"): + chrom = chrom[3:] + + pos = fields[1] + ref = fields[3] + alt = fields[4] + + # Extract FORMAT field and TUMOR data + fmt = fields[8].split(":") + tumor_data = fields[tumor_index].split(":") + + # Get the VAF value + vaf_index = fmt.index(field) + vaf = float(tumor_data[vaf_index]) + + # Create key for the variant + if len(ref) == len(alt) and len(ref) > 1: + for i in range(len(ref)): + keyLine = f"{chrom}:{int(pos) + i}:{ref[i]}:{alt[i]}" + vafs[sample][keyLine] = vaf + else: + keyLine = f"{chrom}:{pos}:{ref}:{alt}" vafs[sample][keyLine] = vaf - else: - keyLine = ":".join([chrom, pos, ref, alt]) - vafs[sample][keyLine] = vaf + + except (ValueError, IndexError) as e: + print(f"Error processing line in {vcfFile}: {line}\n{e}") + continue with open(clusteredMutsFile) as f, open( clusteredMutsPath + project + "_clustered_vaf.txt", "w" From 6051c93d111e8694d486434dbed19f072bf4406b Mon Sep 17 00:00:00 2001 From: Mousumy Kundu Date: Tue, 17 Dec 2024 11:46:10 -0800 Subject: [PATCH 4/4] removed commented lines --- SigProfilerClusters/classifyFunctions.py | 40 ------------------------ 1 file changed, 40 deletions(-) diff --git a/SigProfilerClusters/classifyFunctions.py b/SigProfilerClusters/classifyFunctions.py index 5b0322d..7d7de00 100644 --- a/SigProfilerClusters/classifyFunctions.py +++ b/SigProfilerClusters/classifyFunctions.py @@ -414,46 +414,6 @@ def pullVaf(project, project_path, variant_caller="standard", correction=True): keyLine = ":".join([chrom, pos, ref, alt]) vafs[sample][keyLine] = vaf - # elif variant_caller == "mutect2": - # field = "AF" - # vcf_col = 11 - # vafs = {} - # for vcfFile in vcf_files: - # sample = vcfFile.split(".")[0] - # vafs[sample] = {} - # with open(os.path.join(vcf_path, vcfFile)) as f: - # for lines in f: - # if lines[0] == "#": - # continue - # lines = lines.strip().split() - # chrom = lines[0] - # if chrom.startswith("chr") or chrom.startswith("Chr"): - # chrom = chrom[3:] - # pos = lines[1] - # ref = lines[3] - # alt = lines[4] - # try: - # ## Column 9 is assumed to be FORMAT - # fmt = lines[8].split(":") - # ## Extract VAF index and use it to extract VAF from the provided column - # vaf_ind = fmt.index(field) - # ## Use vcf_col-1 because humans think lists as 1-indexed but python thinks 0-indexed - # vaf = float(lines[vcf_col - 1].split(":")[vaf_ind]) - # except: - # print( - # "Provided VAF field does not match any field in VCF.\n\t", - # vcfFile, - # ) - # break - # if len(ref) == len(alt) and len(ref) > 1: - # for i in range(len(ref)): - # keyLine = ":".join( - # [chrom, str(int(pos) + i), ref[i], alt[i]] - # ) - # vafs[sample][keyLine] = vaf - # else: - # keyLine = ":".join([chrom, pos, ref, alt]) - # vafs[sample][keyLine] = vaf elif variant_caller == "mutect2": field = "AF" # The VAF field in the FORMAT column vafs = {}