Source code for hicstuff.iteralign

#!/usr/bin/env python3
# coding: utf-8
"""Iterative alignment

Iteratively aligns reads from a 3C fastq file: reads
are trimmed to a range-sweeping number of basepairs
and each truncated read set is mapped onto the
reference genome. This may result in a small increase
in the proportion of properly mapped reads.

@author: Remi Montagne & cmdoret
"""

import os
import sys
import glob
import re
import subprocess as sp
import pysam as ps
import shutil as st
import hicstuff.io as hio
import contextlib
from hicstuff.log import logger
from os.path import join




def iterative_align(
    fq_in,
    tmp_dir,
    ref,
    n_cpu,
    bam_out,
    aligner="bowtie2",
    min_len=20,
    min_qual=30,
    read_len=None,
):
    """Iterative alignment

    Aligns the reads of fq_in iteratively with bowtie2, minimap2 or bwa.
    Reads are initially truncated to their first 20 nucleotides; reads that
    remain unmapped are extended by 20 nucleotides and realigned on each
    iteration.

    Parameters
    ----------
    fq_in : str
        Path to input fastq file to align iteratively.
    tmp_dir : str
        Path where temporary files should be written.
    ref : str
        Path to the reference genome if minimap2 is used for alignment.
        Path to the genome index if bowtie2/bwa is used for alignment.
    n_cpu : int
        The number of CPUs to use for the iterative alignment.
    bam_out : str
        Path where the final alignment should be written in BAM format.
    aligner : str
        Choose between minimap2, bwa or bowtie2 for the alignment.
    min_len : int
        The initial length of the fragments to align.
    min_qual : int
        Minimum mapping quality required to keep Hi-C pairs.
    read_len : int
        Read length in the fastq file. If set to None, the length of the
        first read is used. Set this value to the longest read length in the
        file if you have different read lengths.

    Examples
    --------
    iterative_align(fq_in='example_for.fastq', ref='example_bt2_index',
                    bam_out='example_for.bam', aligner="bowtie2")
    iterative_align(fq_in='example_for.fastq', ref='example_genome.fa',
                    bam_out='example_for.bam', aligner="minimap2")
    """
    # Set with the names of the unaligned reads:
    remaining_reads = set()
    total_reads = 0
    # Store the path of the BAM containing aligned reads at each iteration.
    iter_out = []

    # If there is already a file with the same name as the output file,
    # remove it. Otherwise, ignore.
    with contextlib.suppress(FileNotFoundError):
        try:
            os.remove(bam_out)
        except IsADirectoryError:
            logger.error("You need to give the BAM output file, not a folder.")
            raise

    # Bowtie only accepts uncompressed fastq: uncompress it into a temp file
    if aligner == "bowtie2" and hio.is_compressed(fq_in):
        uncomp_path = join(tmp_dir, os.path.basename(fq_in) + ".tmp")
        with hio.read_compressed(fq_in) as inf:
            with open(uncomp_path, "w") as uncomp:
                st.copyfileobj(inf, uncomp)
    else:
        uncomp_path = fq_in

    # Throw an error if the index does not exist
    index = hio.check_fasta_index(ref, mode=aligner)
    if index is None:
        logger.error(
            f"Reference index is missing, please build the {aligner} index first."
        )
        sys.exit(1)

    # Counting reads
    with hio.read_compressed(uncomp_path) as inf:
        for _ in inf:
            total_reads += 1
    total_reads /= 4

    # Use the first read to guess the read length if not provided.
    if read_len is None:
        with hio.read_compressed(uncomp_path) as inf:
            # Skip the first line (read header)
            size = inf.readline()
            # Strip the newline from the sequence line.
            read_len = len(inf.readline().rstrip())

    # Initial length of the fragments to align.
    # In case reads are shorter than the provided min_len.
    if read_len > min_len:
        n = min_len
    else:
        logger.warning(
            "min_len is longer than the reads. Iterative mapping will have no effect."
        )
        n = read_len
    logger.info("{0} reads to parse".format(int(total_reads)))

    first_round = True
    # Iterative alignment per se
    while n <= read_len:
        logger.info(
            "Truncating unaligned reads to {size}bp and mapping{again}.".format(
                size=int(n), again="" if first_round else " again"
            )
        )
        iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))]
        # Generate a temporary input fastq file with the n first nucleotides
        # of the reads.
        truncated_reads = truncate_reads(
            tmp_dir, uncomp_path, remaining_reads, n, first_round
        )

        # Align the truncated reads on the reference genome
        temp_alignment = join(tmp_dir, "temp_alignment.bam")
        map_args = {
            "fa": ref,
            "cpus": n_cpu,
            "fq": truncated_reads,
            "idx": index,
            "bam": temp_alignment,
        }
        if re.match(r"^(minimap[2]?|mm[2]?)$", aligner, flags=re.IGNORECASE):
            cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format(**map_args)
        elif re.match(r"^(bwa)$", aligner, flags=re.IGNORECASE):
            cmd = "bwa mem -t {cpus} -v 1 {idx} {fq}".format(**map_args)
        elif re.match(r"^(bowtie[2]?|bt[2]?)$", aligner, flags=re.IGNORECASE):
            cmd = (
                "bowtie2 -x {idx} -p {cpus} --quiet --very-sensitive {fq}"
            ).format(**map_args)
        else:
            raise ValueError("Unknown aligner. Select bowtie2, minimap2 or bwa.")

        map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
        sort_process = sp.Popen(
            "samtools sort -n -@ {cpus} -O BAM -o {bam}".format(**map_args),
            shell=True,
            stdin=map_process.stdout,
        )
        out, err = sort_process.communicate()

        # Filter the reads: reads whose truncated end was aligned are written
        # to the output file. Reads whose truncated end was not aligned are
        # kept for the next round.
        remaining_reads = filter_bamfile(temp_alignment, iter_out[-1], min_qual)
        n += 20
        first_round = False

    # One last round without trimming
    logger.info(
        "Trying to map unaligned reads at full length ({0}bp).".format(int(read_len))
    )
    truncated_reads = truncate_reads(
        tmp_dir,
        infile=uncomp_path,
        unaligned_set=remaining_reads,
        trunc_len=n,
        first_round=first_round,
    )
    if aligner == "minimap2" or aligner == "Minimap2":
        cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format(
            fa=ref, cpus=n_cpu, fq=truncated_reads
        )
    elif aligner == "bwa" or aligner == "Bwa" or aligner == "BWA":
        cmd = "bwa mem -v 1 -t {cpus} {idx} {fq}".format(
            idx=index, cpus=n_cpu, fq=truncated_reads
        )
    else:
        cmd = "bowtie2 -x {idx} -p {cpus} --quiet --very-sensitive {fq}".format(
            idx=index, cpus=n_cpu, fq=truncated_reads
        )
    map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
    # Keep reads sorted by name
    sort_process = sp.Popen(
        "samtools sort -n -@ {cpus} -O BAM -o {bam}".format(
            cpus=n_cpu, bam=temp_alignment
        ),
        shell=True,
        stdin=map_process.stdout,
    )
    out, err = sort_process.communicate()
    iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))]
    remaining_reads = filter_bamfile(temp_alignment, iter_out[-1], min_qual)

    # Report unaligned reads as well
    iter_out += [join(tmp_dir, "unaligned.bam")]
    temp_bam = ps.AlignmentFile(temp_alignment, "rb", check_sq=False)
    unmapped = ps.AlignmentFile(iter_out[-1], "wb", template=temp_bam)
    for r in temp_bam:
        # Do not write supplementary alignments (keeping 1 alignment/read)
        if r.query_name in remaining_reads and not r.is_supplementary:
            unmapped.write(r)
    unmapped.close()
    temp_bam.close()

    # Merge all aligned reads and unmapped reads into a single bam
    ps.merge("-n", "-O", "BAM", "-@", str(n_cpu), bam_out, *iter_out)

    logger.info(
        "{0} reads aligned / {1} total reads.".format(
            int(total_reads - len(remaining_reads)), int(total_reads)
        )
    )

    return 0
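
# A minimal usage sketch (file and index names below are hypothetical): in a
# Hi-C/3C workflow each mate file is aligned independently, e.g.
#
#   iterative_align("sample_for.fq.gz", "tmp", "genome_index", 8, "tmp/for.bam")
#   iterative_align("sample_rev.fq.gz", "tmp", "genome_index", 8, "tmp/rev.bam")
#
# producing two name-sorted BAM files that can be paired downstream.
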
def truncate_reads(tmp_dir, infile, unaligned_set, trunc_len, first_round):
    """Trim read ends

    Writes the first trunc_len nucleotides of each sequence in infile to an
    auxiliary file in the temporary folder.

    Parameters
    ----------
    tmp_dir : str
        Path to the temporary folder.
    infile : str
        Path to the fastq file to truncate.
    unaligned_set : set
        Contains the names of all reads that did not map unambiguously in
        previous rounds.
    trunc_len : int
        The number of basepairs to keep in each truncated sequence.
    first_round : bool
        If this is the first round, truncate all reads without checking
        mapping.

    Returns
    -------
    str :
        Path to the output fastq file containing truncated reads.
    """
    outfile = "{0}/truncated.fastq".format(tmp_dir)
    with ps.FastxFile(infile, "r") as inf, open(outfile, "w") as outf:
        for entry in inf:
            # Write the read if it did not align in the previous round, or if
            # this is the first round (where all reads are truncated).
            if (entry.name in unaligned_set) or first_round:
                entry.sequence = entry.sequence[:trunc_len]
                entry.quality = entry.quality[:trunc_len]
                outf.write(str(entry) + "\n")
    return outfile
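
# For example (values assumed for illustration), a 75 bp fastq record passed
# with trunc_len=20 is rewritten with only its first 20 bases and the matching
# 20 quality characters; after the first round, reads whose names are absent
# from unaligned_set are simply skipped.
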
def filter_bamfile(temp_alignment, filtered_out, min_qual=30):
    """Filter alignment BAM files

    Reads all the reads in the input BAM alignment file. Writes reads to the
    output file if they are aligned with good quality, otherwise adds their
    name to a set to stage them for the next round of alignment.

    Parameters
    ----------
    temp_alignment : str
        Path to the input temporary alignment.
    filtered_out : str
        Path to the output filtered temporary alignment.
    min_qual : int
        Minimum mapping quality required to keep a Hi-C pair.

    Returns
    -------
    set :
        Contains the names of the reads that did not align.
    """
    # Check the quality and status of each aligned fragment.
    # Write the ones with good quality in the final output file.
    # Keep those that do not map unambiguously for the next round.
    unaligned = set()
    temp_bam = ps.AlignmentFile(temp_alignment, "rb", check_sq=False)
    outf = ps.AlignmentFile(filtered_out, "wb", template=temp_bam)
    for r in temp_bam:
        if r.flag in [0, 16] and r.mapping_quality >= min_qual:
            outf.write(r)
        else:
            unaligned.add(r.query_name)
    logger.info("{0} reads left to map.".format(len(unaligned)))
    temp_bam.close()
    outf.close()

    return unaligned
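
# Example of the filtering criterion (SAM flag values from the format
# specification, not from this module): a record with flag 0 (forward strand,
# primary alignment) or 16 (reverse strand, primary alignment) and
# MAPQ >= min_qual is written out; anything else, e.g. flag 4 (unmapped) or
# flag 256 (secondary alignment), sends the read name back into the returned
# set for another round of alignment.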