Source code for hicstuff.iteralign

#!/usr/bin/env python3
# coding: utf-8
"""Iterative alignment

Iteratively aligns reads from a 3C fastq file: reads
are trimmed to a range-sweeping number of basepairs
and each truncated read set is mapped onto the
reference genome. This may result in a small increase
in the proportion of properly mapped reads.

@author: Remi Montagne & cmdoret
"""

import os
import sys
import glob
import re
import subprocess as sp
import pysam as ps
import shutil as st
import hicstuff.io as hio
import contextlib
from hicstuff.log import logger
from os.path import join




def iterative_align(
    fq_in,
    tmp_dir,
    ref,
    n_cpu,
    bam_out,
    aligner="bowtie2",
    min_len=20,
    min_qual=30,
    read_len=None,
):
    """Iterative alignment

    Aligns the reads of fq_in iteratively with bowtie2, minimap2 or bwa.
    Reads are initially truncated to their first 20 nucleotides; reads that
    remain unmapped are extended by 20 nucleotides and realigned on each
    iteration.

    Parameters
    ----------
    fq_in : str
        Path to input fastq file to align iteratively.
    tmp_dir : str
        Path where temporary files should be written.
    ref : str
        Path to the reference genome if minimap2 is used for alignment.
        Path to the genome index if bowtie2/bwa is used for alignment.
    n_cpu : int
        The number of CPUs to use for the iterative alignment.
    bam_out : str
        Path where the final alignment should be written in BAM format.
    aligner : str
        Choose between minimap2, bwa or bowtie2 for the alignment.
    min_len : int
        The initial length of the fragments to align.
    min_qual : int
        Minimum mapping quality required to keep Hi-C pairs.
    read_len : int
        Read length in the fastq file. If set to None, the length of the
        first read is used. Set this value to the longest read length in the
        file if you have different read lengths.

    Examples
    --------
    iterative_align(fq_in='example_for.fastq', ref='example_bt2_index',
                    bam_out='example_for.bam', aligner="bowtie2")
    iterative_align(fq_in='example_for.fastq', ref='example_genome.fa',
                    bam_out='example_for.bam', aligner="minimap2")
    """
    # Set with the names of the unaligned reads:
    remaining_reads = set()
    total_reads = 0
    # Store the path of the BAM containing aligned reads at each iteration.
    iter_out = []

    # If there is already a file with the same name as the output file,
    # remove it. Otherwise, ignore.
    with contextlib.suppress(FileNotFoundError):
        try:
            os.remove(bam_out)
        except IsADirectoryError:
            logger.error("You need to give the BAM output file, not a folder.")
            raise

    # Bowtie only accepts uncompressed fastq: uncompress it into a temp file
    if aligner == "bowtie2" and hio.is_compressed(fq_in):
        uncomp_path = join(tmp_dir, os.path.basename(fq_in) + ".tmp")
        with hio.read_compressed(fq_in) as inf:
            with open(uncomp_path, "w") as uncomp:
                st.copyfileobj(inf, uncomp)
    else:
        uncomp_path = fq_in

    # Throw an error if the index does not exist
    index = hio.check_fasta_index(ref, mode=aligner)
    if index is None:
        logger.error(
            f"Reference index is missing, please build the {aligner} index first."
        )
        sys.exit(1)

    # Counting reads
    with hio.read_compressed(uncomp_path) as inf:
        for _ in inf:
            total_reads += 1
    total_reads /= 4

    # Use the first read to guess the read length if not provided.
    if read_len is None:
        with hio.read_compressed(uncomp_path) as inf:
            # Skip the first line (read header)
            size = inf.readline()
            # Strip the newline from the sequence line.
            read_len = len(inf.readline().rstrip())

    # Initial length of the fragments to align.
    # In case reads are shorter than the provided min_len.
    if read_len > min_len:
        n = min_len
    else:
        logger.warning(
            "min_len is longer than the reads. Iterative mapping will have no effect."
        )
        n = read_len
    logger.info("{0} reads to parse".format(int(total_reads)))

    first_round = True
    # Iterative alignment per se
    while n <= read_len:
        logger.info(
            "Truncating unaligned reads to {size}bp and mapping{again}.".format(
                size=int(n), again="" if first_round else " again"
            )
        )
        iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))]
        # Generate a temporary input fastq file with the n first nucleotides
        # of the reads.
        truncated_reads = truncate_reads(
            tmp_dir, uncomp_path, remaining_reads, n, first_round
        )

        # Align the truncated reads on the reference genome
        temp_alignment = join(tmp_dir, "temp_alignment.bam")
        map_args = {
            "fa": ref,
            "cpus": n_cpu,
            "fq": truncated_reads,
            "idx": index,
            "bam": temp_alignment,
        }
        if re.match(r"^(minimap[2]?|mm[2]?)$", aligner, flags=re.IGNORECASE):
            cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format(**map_args)
        elif re.match(r"^(bwa)$", aligner, flags=re.IGNORECASE):
            cmd = "bwa mem -t {cpus} -v 1 {idx} {fq}".format(**map_args)
        elif re.match(r"^(bowtie[2]?|bt[2]?)$", aligner, flags=re.IGNORECASE):
            cmd = (
                "bowtie2 -x {idx} -p {cpus} --quiet --very-sensitive {fq}"
            ).format(**map_args)
        else:
            raise ValueError("Unknown aligner. Select bowtie2, minimap2 or bwa.")

        map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
        sort_process = sp.Popen(
            "samtools sort -n -@ {cpus} -O BAM -o {bam}".format(**map_args),
            shell=True,
            stdin=map_process.stdout,
        )
        out, err = sort_process.communicate()

        # Filter the reads: reads whose truncated end was aligned are written
        # to the output file. Reads whose truncated end was not aligned are
        # kept for the next round.
        remaining_reads = filter_bamfile(temp_alignment, iter_out[-1], min_qual)
        n += 20
        first_round = False

    # One last round without trimming
    logger.info(
        "Trying to map unaligned reads at full length ({0}bp).".format(int(read_len))
    )
    truncated_reads = truncate_reads(
        tmp_dir,
        infile=uncomp_path,
        unaligned_set=remaining_reads,
        trunc_len=n,
        first_round=first_round,
    )
    if aligner == "minimap2" or aligner == "Minimap2":
        cmd = "minimap2 -x sr -a -t {cpus} {fa} {fq}".format(
            fa=ref, cpus=n_cpu, fq=truncated_reads
        )
    elif aligner == "bwa" or aligner == "Bwa" or aligner == "BWA":
        cmd = "bwa mem -v 1 -t {cpus} {idx} {fq}".format(
            idx=index, cpus=n_cpu, fq=truncated_reads
        )
    else:
        cmd = "bowtie2 -x {idx} -p {cpus} --quiet --very-sensitive {fq}".format(
            idx=index, cpus=n_cpu, fq=truncated_reads
        )
    map_process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
    # Keep reads sorted by name
    sort_process = sp.Popen(
        "samtools sort -n -@ {cpus} -O BAM -o {bam}".format(
            cpus=n_cpu, bam=temp_alignment
        ),
        shell=True,
        stdin=map_process.stdout,
    )
    out, err = sort_process.communicate()
    iter_out += [join(tmp_dir, "trunc_{0}.bam".format(str(n)))]
    remaining_reads = filter_bamfile(temp_alignment, iter_out[-1], min_qual)

    # Report unaligned reads as well
    iter_out += [join(tmp_dir, "unaligned.bam")]
    temp_bam = ps.AlignmentFile(temp_alignment, "rb", check_sq=False)
    unmapped = ps.AlignmentFile(iter_out[-1], "wb", template=temp_bam)
    for r in temp_bam:
        # Do not write supplementary alignments (keeping 1 alignment/read)
        if r.query_name in remaining_reads and not r.is_supplementary:
            unmapped.write(r)
    unmapped.close()
    temp_bam.close()

    # Merge all aligned reads and unmapped reads into a single bam
    ps.merge("-n", "-O", "BAM", "-@", str(n_cpu), bam_out, *iter_out)

    logger.info(
        "{0} reads aligned / {1} total reads.".format(
            int(total_reads - len(remaining_reads)), int(total_reads)
        )
    )

    return 0
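
# A minimal usage sketch (file and index names below are hypothetical): in a
# Hi-C/3C workflow each mate file is aligned independently, e.g.
#
#   iterative_align("sample_for.fq.gz", "tmp", "genome_index", 8, "tmp/for.bam")
#   iterative_align("sample_rev.fq.gz", "tmp", "genome_index", 8, "tmp/rev.bam")
#
# producing two name-sorted BAM files that can be paired downstream.
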
def truncate_reads(tmp_dir, infile, unaligned_set, trunc_len, first_round):
    """Trim read ends

    Writes the first trunc_len nucleotides of each sequence in infile to an
    auxiliary file in the temporary folder.

    Parameters
    ----------
    tmp_dir : str
        Path to the temporary folder.
    infile : str
        Path to the fastq file to truncate.
    unaligned_set : set
        Contains the names of all reads that did not map unambiguously in
        previous rounds.
    trunc_len : int
        The number of basepairs to keep in each truncated sequence.
    first_round : bool
        If this is the first round, truncate all reads without checking
        mapping.

    Returns
    -------
    str :
        Path to the output fastq file containing truncated reads.
    """
    outfile = "{0}/truncated.fastq".format(tmp_dir)
    with ps.FastxFile(infile, "r") as inf, open(outfile, "w") as outf:
        for entry in inf:
            # Write the read if it did not align in the previous round, or if
            # this is the first round (where all reads are truncated).
            if (entry.name in unaligned_set) or first_round:
                entry.sequence = entry.sequence[:trunc_len]
                entry.quality = entry.quality[:trunc_len]
                outf.write(str(entry) + "\n")
    return outfile
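
# For example (values assumed for illustration), a 75 bp fastq record passed
# with trunc_len=20 is rewritten with only its first 20 bases and the matching
# 20 quality characters; after the first round, reads whose names are absent
# from unaligned_set are simply skipped.
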
def filter_bamfile(temp_alignment, filtered_out, min_qual=30):
    """Filter alignment BAM files

    Reads all the reads in the input BAM alignment file. Writes reads to the
    output file if they are aligned with good quality, otherwise adds their
    name to a set to stage them for the next round of alignment.

    Parameters
    ----------
    temp_alignment : str
        Path to the input temporary alignment.
    filtered_out : str
        Path to the output filtered temporary alignment.
    min_qual : int
        Minimum mapping quality required to keep a Hi-C pair.

    Returns
    -------
    set :
        Contains the names of the reads that did not align.
    """
    # Check the quality and status of each aligned fragment.
    # Write the ones with good quality in the final output file.
    # Keep those that do not map unambiguously for the next round.
    unaligned = set()
    temp_bam = ps.AlignmentFile(temp_alignment, "rb", check_sq=False)
    outf = ps.AlignmentFile(filtered_out, "wb", template=temp_bam)
    for r in temp_bam:
        if r.flag in [0, 16] and r.mapping_quality >= min_qual:
            outf.write(r)
        else:
            unaligned.add(r.query_name)
    logger.info("{0} reads left to map.".format(len(unaligned)))
    temp_bam.close()
    outf.close()

    return unaligned
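
# Example of the filtering criterion (SAM flag values from the format
# specification, not from this module): a record with flag 0 (forward strand,
# primary alignment) or 16 (reverse strand, primary alignment) and
# MAPQ >= min_qual is written out; anything else, e.g. flag 4 (unmapped) or
# flag 256 (secondary alignment), sends the read name back into the returned
# set for another round of alignment.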