Source code for hicstuff.commands

#!/usr/bin/env python3
# coding: utf-8

"""Abstract command classes for hicstuff

This module contains all classes related to hicstuff
commands:

    -iteralign (iterative mapping)
    -digest (genome chunking)
    -cutsite (preprocess fastq by cutting reads into digestion products)
    -filter (Hi-C 'event' sorting: loops, uncuts, weird
     and 'true contacts')
    -view (map visualization)
    -pipeline (whole contact map generation)
    -distancelaw (Analysis tool and plot for the distance law)

Running 'pipeline' implies running 'digest', but not
iteralign or filter unless specified, because they can
take up a lot of time for dimnishing returns.

Note
----
Structure based on Rémy Greinhofer (rgreinho) tutorial on subcommands in
docopt : https://github.com/rgreinho/docopt-subcommands-example
cmdoret, 20181412

Raises
------
NotImplementedError
    Will be raised if AbstractCommand is called for
    some reason instead of one of its children.
ValueError
    Will be raised if an incorrect chunking method (e.g.
    not an enzyme or number or invalid range view is
    specified.
"""
import re
import sys, os, shutil
import tempfile
from os.path import join, dirname
from matplotlib import pyplot as plt
from matplotlib import cm
from docopt import docopt
import pandas as pd
import numpy as np
import pysam as ps
import glob
import copy
from Bio import SeqIO
import hicstuff.view as hcv
import hicstuff.hicstuff as hcs
import hicstuff.cutsite as hcc
import hicstuff.digest as hcd
import hicstuff.iteralign as hci
import hicstuff.filter as hcf
from hicstuff.version import __version__
import hicstuff.io as hio
from hicstuff.log import logger
import hicstuff.pipeline as hpi
import hicstuff.distance_law as hcdl

DIVERGENT_CMAPS = [
    "PiYG",
    "PRGn",
    "BrBG",
    "PuOr",
    "RdGy",
    "RdBu",
    "RdYlBu",
    "RdYlGn",
    "Spectral",
    "coolwarm",
    "bwr",
    "seismic",
]


[docs]class AbstractCommand:
    """Abstract base command class

    Base class for the commands from which
    other hicstuff commands derive.
    """

    def __init__(self, command_args, global_args):
        """Initialize the commands"""
        self.args = docopt(self.__doc__, argv=command_args)
        self.global_args = global_args
        # Map Hi-C format to file extension
        self.fmt2ext = {"cool": ".cool", "bg2": ".bg2", "graal": ".mat.tsv"}

[docs]    def execute(self):
        """Execute the commands"""
        raise NotImplementedError

[docs]    def check_output_path(self, path, force=False):
        """Throws error if the output file exists. Create required file tree otherwise."""
        # Get complete output filename and prevent overwriting unless force is enabled
        if not force and os.path.exists(path):
            raise IOError(
                "Output file already exists. Use --force to overwrite"
            )
        if dirname(path):
            os.makedirs(dirname(path), exist_ok=True)


[docs]class Iteralign(AbstractCommand):
    """Iterative mapping command

    Truncate reads from a fastq file to 20 basepairs and iteratively extend and
    re-align the unmapped reads to optimize the proportion of uniquely aligned
    reads in a 3C library.

    usage:
        iteralign [--aligner=bowtie2] [--threads=1] [--min-len=20] [--read-len=INT]
                  [--tempdir=DIR] --out-bam=FILE --genome=FILE <reads.fq>

    arguments:
        reads.fq                Fastq file containing the reads to be aligned

    options:
        -g, --genome=FILE        The genome on which to map the reads. Must be
                                 the path to the bowtie2/bwa index if using bowtie2/bwa
                                 or to the genome in fasta format if using minimap2.
        -t, --threads=INT        Number of parallel threads allocated for the
                                 alignment [default: 1].
        -T, --tempdir=DIR        Temporary directory. Defaults to current
                                 directory.
        -a, --aligner=bowtie2    Choose alignment software between bowtie2,
                                 minimap2 or bwa. minimap2 should only be used for
                                 reads > 100 bp. [default: bowtie2]
        -l, --min-len=INT        Length to which the reads should be
                                 truncated [default: 20].
        -o, --out-bam=FILE       Path where the alignment will be written in
                                 BAM format.
        -R, --read-len=INT       Read length in input FASTQ file. If not provided,
                                 this is estimated from the first read in the file.
    """

[docs]    def execute(self):
        read_len = self.args["--read-len"]

        if read_len is not None:
            read_len = int(read_len)

        if not self.args["--tempdir"]:
            self.args["--tempdir"] = "."

        temp_directory = hio.generate_temp_dir(self.args["--tempdir"])

        hci.iterative_align(
            self.args["<reads.fq>"],
            temp_directory,
            self.args["--genome"],
            self.args["--threads"],
            self.args["--out-bam"],
            aligner=self.args["--aligner"],
            min_len=int(self.args["--min-len"]),
            read_len=read_len,
        )
        # Deletes the temporary folder
        shutil.rmtree(temp_directory)


[docs]class Digest(AbstractCommand):
    """Genome chunking command

    Digests a fasta file into fragments based on a restriction enzyme or a
    fixed chunk size. Generates two output files into the target directory
    named "info_contigs.txt" and "fragments_list.txt"

    usage:
        digest [--plot] [--figdir=FILE] [--force] [--circular] [--size=0]
               [--outdir=DIR] --enzyme=ENZ <fasta>

    arguments:
        fasta                     Fasta file to be digested

    options:
        -c, --circular                  Specify if the genome is circular.
        -e, --enzyme=ENZ[,ENZ2,...]     A restriction enzyme or an integer
                                        representing fixed chunk sizes (in bp).
                                        Multiple comma-separated enzymes can
                                        be given.
        -F, --force                     Write even if the output file already exists.
        -s, --size=INT                  Minimum size threshold to keep
                                        fragments. [default: 0]
        -o, --outdir=DIR                Directory where the fragments and
                                        contigs files will be written.
                                        Defaults to current directory.
        -p, --plot                      Show a histogram of fragment length
                                        distribution after digestion.
        -f, --figdir=FILE               Path to directory of the output figure.
                                        By default, the figure is only shown
                                        but not saved.

    output:
        fragments_list.txt: information about restriction fragments (or chunks)
        info_contigs.txt: information about contigs or chromosomes

    """

[docs]    def execute(self):
        # If circular is not specified, change it from None to False
        if not self.args["--circular"]:
            self.args["--circular"] = False
        if not self.args["--outdir"]:
            self.args["--outdir"] = os.getcwd()
        # Create output directory if it does not exist
        if os.path.exists(self.args["--outdir"]):
            if not self.args["--force"]:
                raise IOError(
                    "Output directory already exists. Use --force to overwrite"
                )
        else:
            os.makedirs(self.args["--outdir"], exist_ok=True)
        if self.args["--figdir"]:
            figpath = join(self.args["--figdir"], "frags_hist.pdf")
        else:
            figpath = None
        # Split into a list if multiple enzymes given
        enzyme = self.args["--enzyme"]
        if re.search(r",", enzyme):
            enzyme = enzyme.split(",")

        hcd.write_frag_info(
            self.args["<fasta>"],
            enzyme,
            self.args["--size"],
            output_dir=self.args["--outdir"],
            circular=self.args["--circular"],
        )

        hcd.frag_len(
            output_dir=self.args["--outdir"],
            plot=self.args["--plot"],
            fig_path=figpath,
        )


[docs]class Cutsite(AbstractCommand):
    """Cutsite command

    Generates new gzipped fastq files from original fastq. The function will cut
    the reads at their religation sites and creates new pairs of reads with the
    different fragments obtained after cutting at the digestion sites.

    There are three choices to how combine the fragments. 1. "for_vs_rev": All
    the combinations are made between one forward fragment and one reverse
    fragment. 2. "all": All 2-combinations are made. 3. "pile": Only
    combinations between adjacent fragments in the initial reads are made.

    usage:
        cutsite --forward=FILE --reverse=FILE  --prefix=STR --enzyme=STR
        [--mode=for_vs_rev] [--seed-size=20] [--threads=1]

    options:
        -1, --forward=FILE      Fastq file containing the forward reads to 
                                digest.
        -2, --reverse=FILE      Fastq file containing the reverse reads to 
                                digest.
        -p, --prefix=STR        Prefix of the path where to write the digested 
                                gzipped fastq files. Filenames will be added the
                                suffix "_{1,2}.fq.gz".
        -e, --enzyme=STR        The list of restriction enzyme used to digest
                                the genome separated by a comma. Example:
                                HpaII,MluCI.
        -m, --mode=STR          Digestion mode. There are three possibilities:
                                "for_vs_rev", "all" and "pile". The first one
                                "for_vs_rev" makes all possible contact between
                                fragments from forward read versus the fragments
                                of the reverse reads. The second one "all"
                                consist two make all pairs of fragments 
                                possible. The third one "pile" will make the
                                contacts only with the adjacent fragments.
                                [Default: for_vs_rev]
        -s, --seed-size=INT     Minimum size of a read. (i.e. seed size used 
                                in mapping as reads smaller won't be mapped.)
                                [Default: 20]
        -t, --threads=INT       Number of parallel threads allocated for the
                                alignement. [Default: 1]
    """

[docs]    def execute(self):

        # Check for mandatory options
        for option in ["--prefix", "--forward", "--reverse"]:
            if self.args[option] is None:
                raise ValueError(f"{option} is mandatory.")

        prefix = self.args["--prefix"]
        # Create output directory if it does not exist
        if dirname(prefix):
            os.makedirs(dirname(prefix), exist_ok=True)
        output_for = prefix + "_R1.fq.gz"
        output_rev = prefix + "_R2.fq.gz"
        # Digestion of the reads.
        logger.info("Digestion of the reads:")
        logger.info("Enzyme used: {0}".format(self.args["--enzyme"]))
        logger.info(
            "Mode used to cut the reads: {0}".format(self.args["--mode"])
        )
        hcc.cut_ligation_sites(
            self.args["--forward"],
            self.args["--reverse"],
            output_for,
            output_rev,
            enzyme=self.args["--enzyme"],
            mode=self.args["--mode"],
            seed_size=int(self.args["--seed-size"]),
            n_cpu=int(self.args["--threads"]),
        )


[docs]class Filter(AbstractCommand):
    """Mapping event filtering command

    Filters spurious 3C events such as loops and uncuts from the library based
    on a minimum distance threshold automatically estimated from the library by
    default. Can also plot 3C library statistics.

    usage:
        filter [--interactive | --thresholds INT-INT] [--plot]
               [--figdir FILE] [--prefix STR] <input> <output>

    arguments:
        input       2D BED file containing coordinates of Hi-C interacting
                    pairs, the index of their restriction fragment and their
                    strands.
        output      Path to the filtered file, in the same format as the input.

    options:
        -f, --figdir=DIR                  Path to the output figure directory.
                                          By default, the figure is only shown
                                          but not saved.
        -i, --interactive                 Interactively shows plots and asks
                                          for thresholds.
        -p, --plot                        Shows plots of library composition
                                          and 3C events abundance.
        -P, --prefix STR                  If the library has a name, it will
                                          be shown on the figures.
        -t, --thresholds=INT-INT          Manually defines integer values for
                                          the thresholds in the order
                                          [uncut, loop]. Reads above those values
                                          are kept.
    """

[docs]    def execute(self):
        figpath = None
        if self.args["--thresholds"]:
            # Thresholds supplied by user beforehand
            uncut_thr, loop_thr = self.args["--thresholds"].split("-")
            try:
                uncut_thr = int(uncut_thr)
                loop_thr = int(loop_thr)
            except ValueError:
                logger.error(
                    "You must provide integer numbers for the thresholds."
                )
        else:
            # Threshold defined at runtime
            if self.args["--figdir"]:
                figpath = join(self.args["--figdir"], "event_distance.pdf")
                if not os.path.exists(self.args["--figdir"]):
                    os.makedirs(self.args["--figdir"])
            uncut_thr, loop_thr = hcf.get_thresholds(
                self.args["<input>"],
                interactive=self.args["--interactive"],
                plot_events=self.args["--plot"],
                fig_path=figpath,
                prefix=self.args["--prefix"],
            )
        # Filter library and write to output file
        figpath = None
        if self.args["--figdir"]:
            figpath = join(self.args["--figdir"], "event_distribution.pdf")

        hcf.filter_events(
            self.args["<input>"],
            self.args["<output>"],
            uncut_thr,
            loop_thr,
            plot_events=self.args["--plot"],
            fig_path=figpath,
            prefix=self.args["--prefix"],
        )


[docs]class View(AbstractCommand):
    """Contact map visualization command

    Visualize a Hi-C matrix file as a heatmap of contact frequencies. Allows to
    tune visualisation by binning and normalizing the matrix, and to save the
    output image to disk. If no output is specified, the output is displayed.

    usage:
        view [--binning=1] [--despeckle] [--frags FILE] [--trim INT] [--n-mad 3.0] [--lines]
             [--normalize] [--min=0] [--max=99%] [--output=IMG] [--cmap=Reds] [--dpi=300]
             [--transform=STR] [--circular] [--region=STR] <contact_map> [<contact_map2>]

    arguments:
        contact_map             Sparse contact matrix in bg2, cool or graal format
        contact_map2            Sparse contact matrix in bg2, cool or graal format,
                                if given, the log ratio of contact_map/contact_map2
                                will be shown.


    options:
        -b, --binning=INT[bp|kb|Mb|Gb]   Rebin the matrix. If no unit is given, bins will
                                         be merged by groups of INT. If a unit is given,
                                         bins of that size will be generated. [default: 1]
        -c, --cmap=STR                   The name of a matplotlib colormap to
                                         use for the matrix. [default: Reds]
        -C, --circular                   Use if the genome is circular.
        -d, --despeckle                  Remove sharp increases in long range
                                         contact by averaging surrounding
                                         values.
        -D, --dpi=INT                    Map resolution in DPI (dots per inch). [default: 300]
        -f, --frags=FILE                 Required for bp binning and chromosome lines.
                                         Tab-separated file with headers, containing
                                         fragments start position in the 3rd
                                         column, as generated by hicstuff
                                         pipeline.
        -T, --transform=STR              Apply a mathematical transformation to pixel values
                                         to improve visibility of long range signals. Possible
                                         values are: log2, log10, ln, sqrt, exp0.2.
        -l, --lines                      Add dotted lines marking separation between chromosomes
                                         or contigs. Requires --frags.
        -M, --max=INT                    Saturation threshold. Maximum pixel
                                         value is set to this number. Can be
                                         followed by % to use a percentile of
                                         nonzero pixels in the contact
                                         map. [default: 99%]
        -m, --min=INT                    Minimum of the colorscale, works
                                         identically to --max. [default: 0]
        -N, --n-mad=INT                  Number of median absolute deviations (MAD) from the median
                                         of log bin sums allowed to keep bins in the normalization
                                         procedure [default: 3.0].
        -n, --normalize                  Should ICE normalization be performed
                                         before rendering the matrix ?
        -o, --output=FILE                Name of the image file where the view is stored.
        -r, --region=STR[;STR]           Only view a region of the contact map.
                                         Regions are specified as UCSC strings.
                                         (e.g.:chr1:1000-12000). If only one
                                         region is given, it is viewed on the
                                         diagonal. If two regions are given,
                                         The contacts between both are shown.
        -t, --trim=INT                   Trims outlier rows/columns from the
                                         matrix if the sum of their contacts
                                         deviates from the mean by more than
                                         INT standard deviations.
    """

[docs]    def data_transform(self, dense_map, operation="log10"):
        """
        Apply a mathematical operation on a dense Hi-C map. Valid
        operations are: log2, log10, ln, sqrt, exp0.2
        """
        ops = {
            "log10": np.log10,
            "log2": np.log2,
            "ln": np.log,
            "sqrt": np.sqrt,
        }
        if operation in ops:
            return ops[operation](dense_map)
        elif re.match(r"exp", operation):
            splitop = operation.split("exp")
            exp_val = float(splitop[1])
            return dense_map ** exp_val
        elif hasattr(np, operation) and callable(np.__dict__[operation]):
            logger.warning("Using built-in numpy callable: %s", operation)
            return np.__dict__[operation](dense_map)
        else:
            raise TypeError("Supplied transform function is not supported.")

[docs]    def process_matrix(self, sparse_map):
        """
        Performs any combination of binning, normalisation, log transformation,
        trimming and subsetting based on the attributes of the instance class.
        """
        # BINNING
        if self.binning > 1:
            if self.bp_unit:
                self.pos = self.frags.iloc[:, 2]
                binned_map, binned_pos = hcs.bin_bp_sparse(
                    M=sparse_map, positions=self.pos, bin_len=self.binning
                )
                # Get bin numbers of chromosome starts
                binned_start = np.append(
                    np.where(binned_pos == 0)[0], len(binned_pos)
                )
                # Get bin length of each chromosome
                num_binned = binned_start[1:] - binned_start[:-1]
                # Get unique chromosome names without losing original order
                # (numpy.unique sorts output)
                chr_names_idx = np.unique(
                    self.frags.iloc[:, 1], return_index=True
                )[1]
                chr_names = [
                    self.frags.iloc[index, 1]
                    for index in sorted(chr_names_idx)
                ]
                binned_chrom = np.repeat(chr_names, num_binned)
                binned_frags = pd.DataFrame(
                    {"chrom": binned_chrom, "start_pos": binned_pos[:, 0]}
                )
                binned_frags["end_pos"] = binned_frags.groupby("chrom")[
                    "start_pos"
                ].shift(-1)
                chrom_ends = self.frags.groupby("chrom").end_pos.max()
                # Fill ends of chromosome bins with actual chromosome length
                for cn in chrom_ends.index:
                    binned_frags.end_pos[
                        np.isnan(binned_frags.end_pos)
                        & (binned_frags.chrom == cn)
                    ] = chrom_ends[cn]

            else:
                # Note this is a basic binning procedure, chromosomes are
                # not taken into account -> last few fragments of a chrom
                # are merged with the first few of the next
                binned_map = hcs.bin_sparse(
                    M=sparse_map, subsampling_factor=self.binning
                ) 
                if self.frags:
                    binned_frags = self.frags.iloc[:: self.binning, :]
                    binned_frags = binned_frags.reset_index(drop=True)
                    # Since matrix binning ignores chromosomes, we
                    # have to do the same procedure with fragments
                    # we just correct the coordinates to start at 0
                    def shift_min(x):
                        try:
                            x[x == min(x)] = 0
                        except ValueError:
                            pass
                        return x

                    binned_frags.start_pos = binned_frags.groupby(
                        "chrom", sort=False
                    ).start_pos.apply(shift_min)
                else:
                    binned_frags = self.frags

        else:
            binned_map = sparse_map
            binned_frags = self.frags

        # TRIMMING
        if self.args["--trim"]:
            try:
                trim_std = float(self.args["--trim"])
            except ValueError:
                logger.error(
                    "You must specify a number of standard deviations for "
                    "trimming"
                )
                raise
            binned_map = hcs.trim_sparse(binned_map, n_mad=trim_std)

        # NORMALIZATION
        if self.args["--normalize"]:
            binned_map = hcs.normalize_sparse(
                binned_map, norm="ICE", n_mad=float(self.args["--n-mad"])
            )

        # ZOOM REGION
        if self.args["--region"]:
            if self.args["--lines"]:
                raise NotImplementedError(
                    "Chromosome lines are currently incompatible with a region zoom"
                )
            if self.frags is None:
                logger.error(
                    "A fragment file must be provided to subset "
                    "genomic regions. See hicstuff view --help"
                )
                sys.exit(1)
            # Load chromosomes and positions from fragments list
            reg_pos = binned_frags[['chrom', 'start_pos']]
            region = self.args["--region"]
            if ";" in region:
                # 2 input regions: zoom anywhere in matrix
                self.symmetric = False
                reg1, reg2 = region.split(";")
                reg1 = parse_ucsc(reg1, reg_pos)
                reg2 = parse_ucsc(reg2, reg_pos)
            else:
                # Only 1 input region: zoom on diagonal
                region = parse_ucsc(region, reg_pos)
                reg1 = reg2 = region
            binned_map = binned_map.tocsr()
            binned_map = binned_map[reg1[0] : reg1[1], reg2[0] : reg2[1]]
            binned_map = binned_map.tocoo()

        return binned_map, binned_frags

[docs]    def execute(self):

        input_map = self.args["<contact_map>"]
        hic_fmt = hio.get_hic_format(input_map)
        cmap = self.args["--cmap"]
        # Switch to a divergent colormap for plotting ratios
        if (
            self.args["<contact_map2>"] is not None
            and cmap not in DIVERGENT_CMAPS
        ):
            # In case user specified a custom cmap incompatible with ratios
            if cmap != "Reds":
                logger.warning(
                    "You chose a non-divergent colormap. Valid divergent "
                    "cmaps are:\n\t{}".format(" ".join(DIVERGENT_CMAPS))
                )
            logger.info(
                "Defaulting to seismic colormap for ratios. You can pick "
                "another divergent colormap if you wish."
            )
            cmap = "seismic"
        self.bp_unit = False
        bin_str = self.args["--binning"].upper()
        self.symmetric = True
        transform = self.args["--transform"]
        try:
            # Subsample binning
            self.binning = int(bin_str)
        except ValueError:
            if re.match(r"^[0-9]+[KMG]?B[P]?$", bin_str):
                if hic_fmt == "graal" and not self.args["--frags"]:
                    logger.error(
                        "A fragment file must be provided to perform "
                        "basepair binning. See hicstuff view --help"
                    )
                    sys.exit(1)
                # Load positions from fragments list
                self.binning = parse_bin_str(bin_str)
                self.bp_unit = True
            else:
                logger.error(
                    "Please provide an integer or basepair value for binning."
                )
                raise
        sparse_map, self.frags, _ = hio.flexible_hic_loader(
            input_map, fragments_file=self.args["--frags"], quiet=True
        )
        output_file = self.args["--output"]
        processed_map, frags = self.process_matrix(sparse_map)
        # If 2 matrices given compute log ratio
        if self.args["<contact_map2>"]:
            sparse_map2, _, _ = hio.flexible_hic_loader(
                self.args["<contact_map2>"],
                fragments_file=self.args["--frags"],
                quiet=True,
            )
            processed_map2, _ = self.process_matrix(sparse_map2)
            if sparse_map2.shape != sparse_map.shape:
                logger.error(
                    "You cannot compute the ratio of matrices with "
                    "different dimensions"
                )
            # Get log of values for both maps
            processed_map.data = np.log2(processed_map.data)
            processed_map2.data = np.log2(processed_map2.data)
            # Note: Taking diff of logs instead of log of ratio because sparse
            # mat division yields dense matrix in current implementation.
            # Changing base to 2 afterwards.
            processed_map = processed_map.tocsr() - processed_map2.tocsr()
            processed_map = processed_map.tocoo()
            processed_map.data[np.isnan(processed_map.data)] = 0.0
            # Log transformation done already
            transform = False

        if self.args["--despeckle"]:
            processed_map = hcs.despeckle_simple(processed_map)
        try:
            if self.symmetric:
                dense_map = hcv.sparse_to_dense(
                    processed_map, remove_diag=False
                )
            else:
                dense_map = processed_map.toarray()

            def set_v(v, mat):
                if "%" in v:
                    try:
                        valid_pixels = (mat > 0) & (np.isfinite(mat))
                        val = np.percentile(
                            mat[valid_pixels], float(v.strip("%"))
                        )
                    # No nonzero / finite value
                    except IndexError:
                        val = 0
                else:
                    val = float(v)
                return val

            dense_map = dense_map.astype(float)
            self.vmax = set_v(self.args["--max"], dense_map)
            self.vmin = set_v(self.args["--min"], dense_map)
            if self.args["<contact_map2>"]:
                self.vmin, self.vmax = -2, 2
            # Log transform the map and the colorscale limits if needed
            if transform:
                dense_map = self.data_transform(dense_map, transform)
                # self.vmin = np.percentile(dense_map[np.isfinite(dense_map)], 1)
                # self.vmax = self.data_transform(self.vmax, transform)
                self.vmax = set_v(self.args["--max"], dense_map)
                self.vmin = set_v(self.args["--min"], dense_map)
            else:
                # Set 0 values in matrix to NA
                dense_map[dense_map == 0] = np.inf
            # Get chromosome coordinates if required
            if self.args["--lines"]:
                chrom_starts = np.where(np.diff(frags.start_pos) < 0)[0] + 1
            else:
                chrom_starts = None
            # Display NA values in white
            current_cmap = cm.get_cmap().copy()
            current_cmap.set_bad(color=current_cmap(0))

            hcv.plot_matrix(
                dense_map,
                filename=output_file,
                vmin=self.vmin,
                vmax=self.vmax,
                dpi=int(self.args["--dpi"]),
                cmap=cmap,
                chrom_starts=chrom_starts,
            )
        except MemoryError:
            logger.error("contact map is too large to load, try binning more")


[docs]class Pipeline(AbstractCommand):
    """Whole (end-to-end) contact map generation command

    Entire Pipeline to process fastq files into a Hi-C matrix. Uses all the
    individual components of hicstuff.

    usage:
        pipeline [--aligner=bowtie2] [--centromeres=FILE] [--circular] [--distance-law]
                 [--duplicates] [--enzyme=5000] [--filter] [--force] [--mapping=normal]
                 [--matfmt=graal] [--no-cleanup] [--outdir=DIR] [--plot] [--prefix=PREFIX]
                 [--quality-min=30] [--read-len=INT] [--remove-centromeres=0] [--size=0]
                 [--start-stage=fastq] [--threads=1] [--tmpdir=DIR] --genome=FILE <input1> [<input2>]

    arguments:
        input1:             Forward fastq file, if start_stage is "fastq", sam
                            file for aligned forward reads if start_stage is
                            "bam", or a .pairs file if start_stage is "pairs".
        input2:             Reverse fastq file, if start_stage is "fastq", sam
                            file for aligned reverse reads if start_stage is
                            "bam", or nothing if start_stage is "pairs".


    options:
        -a, --aligner=STR             Alignment software to use. Can be either
                                      bowtie2, minimap2 or bwa. minimap2 should
                                      only be used for reads > 100 bp.
                                      [default: bowtie2]
        -c, --centromeres=FILE        Positions of the centromeres separated by
                                      a space and in the same order than the
                                      chromosomes. Discordant with the circular
                                      option.
        -C, --circular                Enable if the genome is circular.
                                      Discordant with the centromeres option.
        -d, --distance-law            If enabled, generates a distance law file
                                      with the values of the probabilities to
                                      have a contact between two distances for
                                      each chromosomes or arms if the file with
                                      the positions has been given. The values
                                      are not normalized, or averaged.
        -D, --duplicates              Filter out PCR duplicates based on read
                                      positions.
        -e, --enzyme={STR|INT}        Restriction enzyme or "mnase" if a string,
                                      or chunk size (i.e. resolution) if a number.
                                      Can also be multiple comma-separated
                                      enzymes. [default: 5000]
        -f, --filter                  Filter out spurious 3C events (loops and
                                      uncuts) using hicstuff filter. Requires
                                      "-e" to be a restriction enzyme or mnase,
                                      not a chunk size. For more informations, see
                                      Cournac et al. BMC Genomics, 2012.
        -F, --force                   Write even if the output file already exists.
        -g, --genome=FILE             Reference genome to map against. Path to
                                      the bowtie2/bwa index if using bowtie2/bwa,
                                      or to a FASTA file if using minimap2.
        -m, --mapping=STR             normal|iterative|cutsite. Parameter of 
                                      mapping. "normal": Directly map reads 
                                      without any process. "iterative": Map 
                                      reads iteratively using iteralign, by 
                                      truncating reads to 20bp and then 
                                      repeatedly extending to align them.
                                      "cutsite": Cut reads at the religation 
                                      sites of the given enzyme using cutsite, 
                                      create new pairs of reads and then align 
                                      them ; enzyme is required [default: normal].
        -M, --matfmt=STR              The format of the output sparse matrix.
                                      Can be "bg2" for 2D Bedgraph format,
                                      "cool" for Mirnylab's cooler software, or
                                      "graal" for graal-compatible plain text
                                      COO format. [default: graal]
        -n, --no-cleanup              If enabled, intermediary BED files will
                                      be kept after generating the contact map.
                                      Disabled by defaut.
        -o, --outdir=DIR              Output directory. Defaults to the current
                                      directory.
        -p, --plot                    Generates plots in the output directory
                                      at different steps of the pipeline.
        -P, --prefix=STR              Overrides default filenames and prefixes all
                                      output files with a custom name.
        -q, --quality-min=INT         Minimum mapping quality for selecting
                                      contacts. [default: 30].
        -r, --remove-centromeres=INT  Integer. Number of kb that will be remove around
                                      the centromere position given by in the centromere
                                      file. [default: 0]
        -R, --read-len=INT            Maximum read length in the fastq file. Optionally
                                      used in iterative alignment mode. Estimated from
                                      the first read by default. Useful if input fastq
                                      is a composite of different read lengths.
        -s, --size=INT                Minimum size threshold to consider
                                      contigs. Keep all contigs by default.
                                      [default: 0]
        -S, --start-stage=STR         Define the starting point of the pipeline
                                      to skip some steps. Default is "fastq" to
                                      run from the start. Can also be "bam" to
                                      skip the alignment, "pairs" to start from a
                                      single pairs file or "pairs_idx" to skip
                                      fragment attribution and only build the
                                      matrix. [default: fastq]
        -t, --threads=INT             Number of threads to allocate.
                                      [default: 1].
        -T, --tmpdir=DIR              Directory for storing intermediary BED
                                      files and temporary sort files. Defaults
                                      to the output directory.

    output:
        abs_fragments_contacts_weighted.txt: the sparse contact map
        fragments_list.txt: information about restriction fragments (or chunks)
        info_contigs.txt: information about contigs or chromosomes
        hicstuff.log: details and statistics about the run.
    """

[docs]    def execute(self):

        if self.args["--filter"] and self.args["--enzyme"].isdigit():
            raise ValueError(
                "You cannot filter without specifying a restriction enzyme."
            )
        elif self.args["--enzyme"] in ("mnase", "dnase"):
            logger.info("## Enzyme provided is 'mnase', setting bin-size to 100bp")
            self.args["--enzyme"] = 100

        if not self.args["--outdir"]:
            self.args["--outdir"] = os.getcwd()

        if self.args["--matfmt"] not in ("graal", "bg2", "cool"):
            logger.error("matfmt must be either bg2, cool or graal.")
            raise ValueError

        read_len = self.args["--read-len"]
        if read_len is not None:
            read_len = int(read_len)

        hpi.full_pipeline(
            genome=self.args["--genome"],
            input1=self.args["<input1>"],
            input2=self.args["<input2>"],
            aligner=self.args["--aligner"],
            centromeres=self.args["--centromeres"],
            circular=self.args["--circular"],
            distance_law=self.args["--distance-law"],
            enzyme=self.args["--enzyme"],
            filter_events=self.args["--filter"],
            force=self.args["--force"],
            mapping=self.args["--mapping"],
            mat_fmt=self.args["--matfmt"],
            min_qual=int(self.args["--quality-min"]),
            min_size=int(self.args["--size"]),
            no_cleanup=self.args["--no-cleanup"],
            out_dir=self.args["--outdir"],
            pcr_duplicates=self.args["--duplicates"],
            plot=self.args["--plot"],
            prefix=self.args["--prefix"],
            read_len=read_len,
            remove_centros=self.args["--remove-centromeres"],
            start_stage=self.args["--start-stage"],
            threads=int(self.args["--threads"]),
            tmp_dir=self.args["--tmpdir"],
        )


[docs]class Scalogram(AbstractCommand):
    """
    Generate a scalogram.

    usage:
        scalogram [--cmap=viridis] [--centromeres=FILE] [--frags=FILE] [--range=INT-INT]
                  [--threads=1] [--output=FILE] [--normalize]
                  [--indices=INT-INT] [--despeckle] <contact_map>

    argument:
        <contact_map> The sparse Hi-C contact matrix.

    options:
        -C, --cmap=STR                     The matplotlib colormap to use for
                                           the plot. [default: viridis]
        -d, --despeckle                    Remove speckles (artifactual spots)
                                           from the matrix.
        -f, --frags=FILE                   Fragments_list.txt file providing mapping
                                           between genomic coordinates and bin IDs.
        -i, --indices=INT-INT              The range of bin numbers of the matrix to
                                           use for the plot. Can also be given in
                                           UCSC style genomic coordinates (requires -f).
                                           E.g. chr1:1Mb-10Mb.
        -o, --output=FILE                  Output file where the plot should be
                                           saved. Plot is only displayed by
                                           default.
        -n, --normalize                    Normalize the matrix first.
        -r, --range=INT-INT                The range of contact distance to look
                                           at. No limit by default. Values in
                                           basepairs by default but a unit can
                                           be specified (kb, Mb, ...).
        -t, --threads=INT                  Parallel processes to run in for
                                           despeckling. [default: 1]
    """

[docs]    def execute(self):
        mat, frags, _ = hio.flexible_hic_loader(
            self.args["<contact_map>"], fragments_file=self.args["--frags"]
        )
        if frags is not None:
            # If fragments_list.txt is provided, load chrom start and end columns
            frags = pd.read_csv(
                self.args["--frags"], delimiter="\t", usecols=(1, 2, 3)
            )
        if self.args["--range"]:
            shortest, longest = self.args["--range"].split("-")
            # If range given in number of bins
            try:
                shortest, longest = int(shortest), int(longest)
            # If range given in genomic scale
            except ValueError:
                shortest, longest = (
                    parse_bin_str(shortest),
                    parse_bin_str(longest),
                )
                # Use average bin size to convert genomic scale to number of bins
                avg_res = (frags.end_pos - frags.start_pos).mean()
                shortest, longest = (
                    int(shortest // avg_res),
                    int(longest // avg_res),
                )

        if self.args["--indices"]:
            start, end = self.args["--indices"].split("-")
            # If given in bin numbers
            try:
                start = int(start)
                end = int(end)
            # If given in genomic coordinates
            except ValueError:
                start, end = parse_ucsc(
                    self.args["--indices"],
                    frags.loc[:, ["chrom", "start_pos"]],
                )

        output_file = self.args["--output"]
        # good_bins = np.array(range(S.shape[0]))
        S = mat.tocsr()
        if not self.args["--range"]:
            shortest = 0
            longest = S.shape[0]

        if self.args["--normalize"]:
            # good_bins = np.where(hcs.get_good_bins(S, n_std=3) == 1)[0]
            S = hcs.normalize_sparse(S, norm="ICE")
            S = S.tocsr()
        if self.args["--despeckle"]:
            S = hcs.despeckle_simple(S, threads=int(self.args["--threads"]))

        # Cropping matrix before transforming to dense to reduce memory overhead
        # Note we leave a margin equal to longest range so that all windows can be computed
        if self.args["--indices"]:
            crop_inf, crop_sup = (
                max(0, start - longest),
                min(S.shape[0], end + longest),
            )
            crop_later = longest
            S = S[crop_inf:crop_sup, crop_inf:crop_sup]
        else:
            crop_later = 0

        D = hcv.sparse_to_dense(S)
        D = np.fliplr(np.rot90(hcs.scalogram(D), k=-1))
        # Crop the margin left previously to get actual indices on dimenstion 0
        # and focus scale to --range on dimension 1
        plt.contourf(
            D[crop_later : D.shape[1] - crop_later, shortest:longest],
            cmap=self.args["--cmap"],
        )
        if output_file:
            plt.savefig(output_file)
        else:
            plt.show()


[docs]class Rebin(AbstractCommand):
    """
    Rebins a Hi-C matrix and modifies its fragment and chrom files accordingly.
    Output files are in the same format as the input files (cool, graal or bg2).
    usage:
        rebin [--binning=1] [--frags=FILE] [--force] [--chroms=FILE] <contact_map> <out_prefix>

    arguments:
        contact_map             Sparse contact matrix in graal, cool or bg2 format.
        out_prefix              Prefix path (without extension) for the output files.

    options:
        -b, --binning=INT[bp|kb|Mb|Gb]   Subsampling factor or fix value in
                                         basepairs to use for binning
                                         [default: 1].
        -f, --frags=FILE                 Tab-separated file with headers,
                                         containing fragments start position in
                                         the 3rd column. This is the file
                                         "fragments_list.txt" generated by
                                         hicstuff pipeline. Required for graal
                                         matrices and recommended for bg2.
        -F, --force                      Write even if the output file already exists.
        -c, --chroms=FILE                Tab-separated with headers, containing
                                         chromosome names, size, number of
                                         restriction fragments. This is the file
                                         "info_contigs.txt" generated by hicstuff
                                         pipeline.
    """

[docs]    def execute(self):
        prefix = self.args["<out_prefix>"]
        bin_str = self.args["--binning"].upper()
        hic_fmt = hio.get_hic_format(self.args["<contact_map>"])
        # Get complete output filename and prevent overwriting unless --force is enabled
        out_name = prefix + self.fmt2ext[hic_fmt]
        self.check_output_path(out_name, force=self.args["--force"])
        # Load positions from fragments list and chromosomes from chrom file
        map_path = self.args["<contact_map>"]
        hic_map, frags, chromlist = hio.flexible_hic_loader(
            map_path,
            fragments_file=self.args["--frags"],
            chroms_file=self.args["--chroms"],
        )
        if hic_fmt == "graal" and (frags is None or chromlist is None):
            raise ValueError(
                "You must provide a chroms file and a fragments file "
                "when rebinning a matrix in graal format. (hint: the "
                "files info_contigs.txt and fragments_list.txt)"
            )
        # Create output directory if it does not exist
        if dirname(prefix):
            os.makedirs(dirname(prefix), exist_ok=True)
        bp_unit = False
        try:
            # Subsample binning
            binning = int(bin_str)
        except ValueError:
            # Basepair binning: determine bin size
            if re.match(r"^[0-9]+[KMG]?B[P]?$", bin_str):
                binning = parse_bin_str(bin_str)
                bp_unit = True
            else:
                logger.error(
                    "Please provide an integer or basepair value for binning."
                )
                raise
        chromnames = np.unique(frags.chrom)

        if bp_unit:
            # Basepair binning: Perform binning
            hic_map, _ = hcs.bin_bp_sparse(hic_map, frags.start_pos, binning)
            for chrom in chromnames:
                chrom_mask = frags.chrom == chrom
                # For all chromosomes, get new bin start positions
                bin_id = frags.loc[chrom_mask, "start_pos"] // binning
                frags.loc[chrom_mask, "id"] = bin_id + 1
                frags.loc[chrom_mask, "start_pos"] = binning * bin_id
                bin_ends = binning * bin_id + binning
                # Do not allow bin ends to be larger than chrom size
                try:
                    chromsize = chromlist.length[
                        chromlist.contig == chrom
                    ].values[0]
                except AttributeError:
                    chromsize = chromlist["length_kb"][
                        chromlist.contig == chrom
                    ].values[0]
                bin_ends[bin_ends > chromsize] = chromsize
                frags.loc[frags.chrom == chrom, "end_pos"] = bin_ends

            # Account for special cases where restriction fragments are larger than
            # bin size, resulting in missing bins (i.e. jumps in bin ids)
            id_diff = (
                np.array(frags.loc[:, "id"])[1:]
                - np.array(frags.loc[:, "id"])[:-1]
            )
            # Normal jump is 1, new chromosome (reset id) is < 0, abnormal is > 1
            # Get panda indices of abnormal jumps
            jump_frag_idx = np.where(id_diff > 1)[0]
            add_bins = id_diff - 1
            # Need to insert [jump] bins after indices with abnormal [jump]
            miss_bins = [None] * np.sum(add_bins[jump_frag_idx])
            miss_bin_id = 0
            for idx in jump_frag_idx:
                jump_size = add_bins[idx]
                for j in range(1, jump_size + 1):
                    # New generated bins will be given attributes based on the previous bin
                    # e.g. if 2 missing bins between bins 2 and 5:
                    # id[3] = id[2] + 1 * 1 and id[4] = id[2] + 1 * 2
                    miss_bins[miss_bin_id] = {
                        "id": frags.loc[idx, "id"] + 1 * j,
                        "chrom": frags.loc[idx, "chrom"],
                        "start_pos": frags.loc[idx, "start_pos"] + binning * j,
                        "end_pos": frags.loc[idx, "end_pos"] + binning * j,
                        "size": binning,
                        "gc_content": np.NaN,
                    }
                    miss_bin_id += 1
                    # Shift bins row idx to allow

            # Give existing bins spaced row idx to allow inserting missing bins
            idx_shift = copy.copy(id_diff)
            idx_shift[idx_shift < 1] = 1
            existing_bins_idx = np.cumsum(idx_shift)
            # Prepend first bin (lost when computing diff)
            existing_bins_idx = np.insert(existing_bins_idx, 0, 0)
            # Add missing bins to original table, and sort by idx
            # missing bins are "holes" in the continuous range of existing bins
            missing_bins_idx = sorted(
                set(range(existing_bins_idx[0], existing_bins_idx[-1]))
                - set(existing_bins_idx)
            )
            miss_bins_df = pd.DataFrame(
                miss_bins, columns=frags.columns, index=missing_bins_idx
            )
            frags["tmp_idx"] = existing_bins_idx
            miss_bins_df["tmp_idx"] = missing_bins_idx
            frags = pd.concat([frags, miss_bins_df], axis=0, sort=False)
            frags.sort_values("tmp_idx", axis=0, inplace=True)
            frags.drop("tmp_idx", axis=1, inplace=True)

        else:
            # Subsample binning
            hic_map = hcs.bin_sparse(hic_map, binning)
            # Use index for binning, but keep 1-indexed.
            # Exception when binning is 1 (no binning) where no need to shift
            shift_id = 0 if binning == 1 else 1
            frags.id = (frags.id // binning) + shift_id

        # Save original columns order
        col_ordered = list(frags.columns)
        # Get new start and end position for each bin
        frags = frags.groupby(["chrom", "id"], sort=False, observed=True)
        positions = frags.agg({"start_pos": "min", "end_pos": "max"})
        positions.reset_index(inplace=True)
        # Compute mean for all added features in each index bin
        # Normally only other feature is GC content
        try:
            features = frags.agg("mean")
            features.reset_index(inplace=True)
            # set new bins positions
            frags = features
            frags["start_pos"] = 0
            frags["end_pos"] = 0
            frags.loc[:, positions.columns] = positions
        except pd.core.base.DataError:
            frags = positions
        frags["size"] = frags.end_pos - frags.start_pos
        cumul_bins = 0
        for chrom in chromnames:
            # Update cumulative length column in chromlist
            chrom_frags = frags.chrom == chrom
            n_bins = frags.start_pos[chrom_frags].shape[0]
            chromlist.loc[chromlist.contig == chrom, "n_frags"] = n_bins
            chromlist.loc[
                chromlist.contig == chrom, "cumul_length"
            ] = cumul_bins
            cumul_bins += n_bins
            # Adjust each chromosome's last bin end to match chromsize
            last_frag_end = frags.loc[chrom_frags, 'end_pos'].max()
            chromlen = chromlist.loc[chromlist.contig == chrom, 'length'].values[0]
            frags.loc[chrom_frags & (frags.end_pos == last_frag_end), 'end_pos'] = chromlen


        # Keep original column order
        frags = frags.reindex(columns=col_ordered)
        # Write 3 binned output files
        hio.flexible_hic_saver(
            hic_map,
            self.args["<out_prefix>"],
            frags=frags,
            chroms=chromlist,
            hic_fmt=hic_fmt,
        )


[docs]class Subsample(AbstractCommand):
    """
    Subsample contacts from a Hi-C matrix. Probability of sampling is proportional
    to the intensity of the bin.
    usage:
        subsample  [--prop=0.1] [--force] <contact_map> <subsampled_prefix>

    arguments:
        contact_map             Sparse contact matrix in graal, bg2 or cool format.
        subsampled_prefix       Path without extension to the output map in the same
                                format as the input containing only a fraction of the
                                contacts.

    options:
        -p, --prop=FLOAT        Proportion of contacts to sample from the input matrix
                                if between 0 and 1. Raw number of contacts to keep if
                                superior to 1. [default: 0.1]
        -F, --force             Write even if the output file already exists.
    """

[docs]    def execute(self):
        hic_fmt = hio.get_hic_format(self.args["<contact_map>"])
        prefix = self.args["<subsampled_prefix>"]
        # Get complete output filename and prevent overwriting unless --force is enabled
        out_name = prefix + self.fmt2ext[hic_fmt]
        self.check_output_path(out_name, force=self.args["--force"])
        mat, frags, _ = hio.flexible_hic_loader(
            self.args["<contact_map>"], quiet=True
        )
        subsampled = hcs.subsample_contacts(mat, float(self.args["--prop"]))
        subsampled = subsampled.tocoo()
        hio.flexible_hic_saver(
            subsampled,
            prefix,
            frags=frags,
            hic_fmt=hic_fmt,
            quiet=True,
        )


[docs]class Convert(AbstractCommand):
    """
    Convert between different Hi-C dataformats. Currently supports tsv (graal),
    bedgraph2D (bg2) and cooler (cool). Input format is automatically inferred.

    usage:
        convert [--frags=FILE] [--chroms=FILE] [--force] [--genome=FILE]
                [--to=cool] <contact_map> <prefix>

    arguments:
        contact_map               The file containing the contact frequencies.
        prefix                    The prefix path for output files. An extension
                                  will be added to the files depending on the
                                  output format.

    options:
        -f, --frags=FILE          Tab-separated file with headers,
                                  containing columns id, chrom, start_pos,
                                  end_pos size. This is the file
                                  "fragments_list.txt" generated by
                                  hicstuff pipeline. Required for graal
                                  matrices and recommended for bg2.
        -F, --force               Write even if the output file already exists.
        -g, --genome=FILE         Optional genome file used to add a GC content
                                  column to the fragments table. This is
                                  required to generate instagraal-compatible
                                  files.
        -c, --chroms=FILE         Tab-separated with headers, containing
                                  columns contig, length, n_frags, cumul_length.
                                  This is the file "info_contigs.txt" generated
                                  by hicstuff pipeline.
        -T, --to=STR              The format to which files should be
                                  converted. [default: cool]
    """

[docs]    def execute(self):
        out_fmt = self.args["--to"]
        mat_path = self.args["<contact_map>"]
        frags_path = self.args["--frags"]
        genome_path = self.args["--genome"]
        chroms_path = self.args["--chroms"]
        prefix = self.args["<prefix>"]

        # Get complete output filename and prevent overwriting unless --force is enabled
        out_name = prefix + self.fmt2ext[out_fmt]
        self.check_output_path(out_name, force=self.args["--force"])

        # Load
        mat, frags, chroms = hio.flexible_hic_loader(
            mat_path,
            fragments_file=frags_path,
            chroms_file=chroms_path,
            quiet=True,
        )

        # Modify fragments for instagraal compatibility
        # Add fragments size column
        chrom_col, start_col, end_col = hio.get_pos_cols(frags)
        size = frags[end_col] - frags[start_col]
        if "size" not in frags.columns:
            frags = frags.join(pd.DataFrame({"size": size}))
        # If genome was provided, add gc_content column
        if genome_path:
            gc = hio.gc_bins(genome_path, frags)
            frags = frags.join(pd.DataFrame({"gc_content": gc}))

        # Write
        mat = mat.astype(int)
        hio.flexible_hic_saver(
            mat=mat,
            out_prefix=prefix,
            frags=frags,
            chroms=chroms,
            hic_fmt=out_fmt,
        )


[docs]class Distancelaw(AbstractCommand):
    """Distance law tools.
    Take the distance law file from hicstuff and can average it, normalize it compute the
    slope of the curve and plot it.

    usage:
        distancelaw [--average] [--big-arm-only=INT] [--base=1.1] [--centromeres=FILE]
                    [--circular] [--frags=FILE] [--inf=INT] [--outputfile-img=FILE]
                    [--outputfile-tabl=FILE] [--labels=DIR] [--sup=INT]
                    [--remove-centromeres=0] (--pairs=FILE | --dist-tbl=FILE1[,FILE2,...])

    options:
        -a, --average                       If set, calculate the average of the distance
                                            law of the different chromosomes/arms in each
                                            condition. If two file given average is
                                            mandatory.
        -b, --big-arm-only=INT              Integer. It given will take only the arms bigger
                                            than the value given as argument.
        -B, --base=FLOAT                    Float corresponding of the base of the log to
                                            make the logspace which will slice the genomes
                                            in logbins. These slices will be in basepairs
                                            unit. [default is 1.1]
        -c, --centromeres=FILE              Positions of the centromeres separated by
                                            a space and in the same order as the
                                            chromosomes. This allows to plot chromosomal arms
                                            separately. Note this will only work with --pairs
                                            input, as the distance law needs to be recomputed.
                                            Incompatible with the circular option.
        -C, --circular                      Enable if the genome is circular. Discordant
                                            with the centromeres option.
        -d, --dist-tbl=FILE1[,FILE2,...]    Directory to the file or files containing the
                                            compute distance law. File should have the same
                                            format than the ones made by hicstuff pipeline.
        -f, --frags=FILE                    Tab-separated file with headers, containing
                                            columns id, chrom, start_pos, end_pos size.
                                            This is the file "fragments_list.txt" generated by
                                            hicstuff pipeline. Required if pairs are given.
        -i, --inf=INT                       Inferior born to plot the distance law. By
                                            default the value is 3000 bp (3 kb). Have to
                                            be strictly positive.
        -l, --labels=STR1,STR2...           List of string of the labels for the plot
                                            separated by a coma. If no labels given, give
                                            the names "Sample 1", "Sample 2"...
        -o, --outputfile-img=FILE           Output file. Format must be compatible with
                                            plt.savefig. Default : None.
        -O, --outputfile-tabl=TABLE         Output file. Default : None.
        -p, --pairs=FILE                    Pairs file. Format from 4D Nucleome Omics Data
                                            Standards Working Group with the 8th and 9th
                                            coulumns are the ID of the fragments of the
                                            reads 1 and 2. Only add if no distance_law table
                                            given. It will compute the table from these pairs
                                            and the fragments from the fragments file.
        -r, --remove-centromeres=INT        Integer. Number of kb that will be remove around
                                            the centromere position given by in the centromere
                                            file. [default: 0]
        -s, --sup=INT                       Superior born to plot the distance law. By
                                            default the value is the maximum length of all
                                            the dataset given. Also if big arm only set, it
                                            will be the minimum size of the arms/chromosomes
                                            taken to make the average.
    """

[docs]    def execute(self):
        # Give no file as output_file_img if no given.
        if self.args["--outputfile-img"]:
            output_file_img = self.args["--outputfile-img"]
        else:
            output_file_img = None
        # Compute the table of distance law if pairs given
        if self.args["--pairs"]:
            # Sanity check : frags mandatory if pairs given.
            if not self.args["--frags"] or self.args["--dist-tbl"]:
                logger.error(
                    "You have to give fragments and/or not give table of the distance law if pairs file given."
                )
                sys.exit(1)
            pairs = self.args["--pairs"]
            fragments = self.args["--frags"]
            # Give no file as output_file_tabl if no given.
            if self.args["--outputfile-tabl"]:
                output_file_tabl = self.args["--outputfile-tabl"]
            else:
                output_file_tabl = None
            # Check if centromeres file given and if the centromeres have to be removed
            if self.args["--centromeres"]:
                centromeres = self.args["--centromeres"]
                rm_centro = int(self.args["--remove-centromeres"])
            else:
                centromeres = None
                rm_centro = 0
            # Check if circular condition given
            if self.args["--circular"]:
                circular = self.args["--circular"]
            else:
                circular = None
            # Check logarithm base
            if self.args["--base"]:
                base = float(self.args["--base"])
            else:
                base = 1.1
            xs, ps, names = [None], [None], [None]
            xs[0], ps[0], names[0] = hcdl.get_distance_law(
                pairs_reads_file=pairs,
                fragments_file=fragments,
                centro_file=centromeres,
                base=base,
                out_file=output_file_tabl,
                circular=circular,
                rm_centro=rm_centro,
            )
            length_files = 1
        else:
            # Put in a list the path or the different paths given.
            distance_law_file = self.args["--dist-tbl"]
            distance_law_files = distance_law_file.split(",")
            length_files = len(distance_law_files)
            # Make new lists for the modified distance law.
            xs = [None] * length_files
            ps = [None] * length_files
            names = [None] * length_files
            # Iterate on the different file given by the user.
            for i in range(length_files):
                xs[i], ps[i], names[i] = hcdl.import_distance_law(
                    distance_law_files[i]
                )
            names = [name[0] for name in names]
        # Put the inf and sup according to the arguments given.
        if self.args["--inf"]:
            inf = int(self.args["--inf"])
        else:
            inf = 3000
        if self.args["--sup"]:
            sup = int(self.args["--sup"])
        else:
            sup = max(max(xs[0], key=len))
        # Add the option big army only.
        if self.args["--big-arm-only"]:
            big_arm_only = True
            arm_sup = int(self.args["--big-arm-only"])
        else:
            big_arm_only = False
            arm_sup = sup
        # Sanity check : Average mandatory if more than one file.
        if not self.args["--average"] and length_files > 1:
            logger.error("You have to average if more than one file.")
            sys.exit(1)
        # Iterate on the different file given by the user.
        for i in range(length_files):
            # Make the average if enabled
            if self.args["--average"]:
                xs[i], ps[i] = hcdl.average_distance_law(
                    xs[i], ps[i], arm_sup, big_arm_only
                )
                # If not average, we should to remove one level of list to have the good dimension.
        if not self.args["--average"]:
            names = names[0]
            xs = xs[0]
            ps = ps[0]
        # Normalize and make the derivative
        ps = hcdl.normalize_distance_law(xs, ps, inf, arm_sup)

        # Gave new names for the different samples.
        if self.args["--labels"]:
            labels = self.args["--labels"].split(",")
        else:
            if length_files == 1 and not self.args["--average"]:
                labels = []
                if len(names) > 1:
                    for chr_names in names:
                        labels.append(chr_names)
                else:
                    labels = names
            else:
                labels = []
                for i in range(length_files):
                    labels.append("Sample " + str(i))
                    
        # Make the plot if enabled, if not average plot the different arms or
        # chromosomes with the initial names else plot the different conditions
        # with the names labels.
        hcdl.plot_ps_slope(xs, ps, labels, output_file_img, inf, sup)

        # Export the new table if required.
        if self.args["--outputfile-tabl"]:
            hcdl.export_distance_law(
                xs,
                ps,
                labels, 
                self.args["--outputfile-tabl"]
            )


[docs]class Missview(AbstractCommand):
    """
    Previews bins that will be missing in a Hi-C map with a given read length by
    finding repetitive regions in the genome.

    usage:
        missview [--aligner=bowtie2] [--force] [--binning=5000]
                 [--threads=1] [--tmpdir=STR] --read-len=INT <genome> <output>

    arguments:
        genome               Genome file in fasta format.
        output               Path to the output image.

    options:
        -a, --aligner=STR    The read alignment software to use. Can be either
                             bowtie2, minimap2 or bwa. minimap2 should only be
                             used for reads > 100 bp. [default: bowtie2]
        -b, --binning=INT    Resolution to use to preview the Hi-C map. [default: 5000]
        -F, --force          Write even if the output file already exists.
        -R, --read-len=INT   Write even if the output file already exists.
        -t, --threads=INT    Number of CPUs to use in parallel. [default: 1]
        -T, --tmpdir=STR     Directory where temporary files will be generated.
    """

[docs]    def execute(self):
        aligner = self.args["--aligner"]
        force = self.args["--force"]
        genome = self.args["<genome>"]
        out = self.args["<output>"]
        resolution = parse_bin_str(self.args["--binning"])
        read_len = int(self.args["--read-len"])
        threads = int(self.args["--threads"])
        tmp_dir = self.args["--tmpdir"]
        if tmp_dir is None:
            tmp_dir = tempfile.TemporaryDirectory().name
        # Simulate reads and save into a fastq file
        phred = "F" * read_len
        tmp_fq = join(tmp_dir, "simulated_reads.fq")
        tmp_bam = join(tmp_dir, "simulated_reads.bam")
        self.check_output_path(tmp_fq, force=force)
        logger.info(
            "Simulating reads by splitting the genome into %i bp chunks",
            read_len,
        )
        with open(tmp_fq, "w") as fq_handle:
            for rec in SeqIO.parse(genome, "fasta"):
                for i in range(len(rec.seq) - read_len):
                    fq_handle.write("@NS_SIM_%s_%i\n" % (rec.id, i))
                    fq_handle.write(str(rec.seq[i : i + read_len]))
                    fq_handle.write("\n+\n" + phred + "\n")
        # Map reads to genome
        hpi.align_reads(
            tmp_fq,
            genome,
            tmp_bam,
            tmp_dir=tmp_dir,
            threads=threads,
            aligner=aligner,
        )
        # Sort alignments by name
        ps.sort(
            "-@",
            str(threads),
            "-n",
            "-O",
            "BAM",
            "-o",
            tmp_bam + ".sorted",
            tmp_bam,
        )
        shutil.move(tmp_bam + ".sorted", tmp_bam)
        # Run the standard pipeline with, using twice the forward reads.
        # This will generate a diagonal-only matrix
        hpi.full_pipeline(
            genome,
            tmp_bam,
            tmp_bam,
            start_stage="bam",
            aligner=aligner,
            enzyme=resolution,
            force=force,
            threads=threads,
            out_dir=tmp_dir,
            tmp_dir=tmp_dir,
        )
        # Plot and save matrix
        mat_path = join(tmp_dir, "abs_fragments_contacts_weighted.txt")
        mat = hio.load_sparse_matrix(mat_path)
        # Check which bins are not at the median (i.e. drop in mapping rate)
        log_content = open(glob.glob(join(tmp_dir, "*.log"))[0]).read()
        # Get (int rounded) percentage of reads mapped and convert to proportion
        prop_mapped = (
            int(re.search(r".*INFO :: ([0-9]*)% reads.*", log_content)[1])
            / 100
        )
        logger.info(
            "Bins with less than %s mapped reads will be considered undetectable",
            str(100 * prop_mapped) + "%",
        )
        unmappable = mat.diagonal(0) < prop_mapped * resolution
        mappable_mat = np.ones(mat.shape)
        mappable_mat[unmappable, :] = 0
        mappable_mat[:, unmappable] = 0
        hcv.plot_matrix(
            mappable_mat,
            filename=out,
            title=" %.3f%% missing bins for %s \nwith %i bp reads at resolution %i."
            % (
                round(100 * sum(unmappable) / len(unmappable), 3),
                os.path.basename(genome),
                read_len,
                resolution,
            ),
            dpi=600,
            vmax=2,
            cmap="Greys",
        )
        logger.info("Output image saved at %s.", out)


[docs]def parse_bin_str(bin_str):
    """Bin string parsing

    Take a basepair binning string as input and converts it into
    corresponding basepair values.

    Parameters
    ----------
    bin_str : str
        A basepair region (e.g. 150KB). Unit can be BP, KB, MB, GB.

    Example
    -------

        >>> parse_bin_str("150KB")
        150000
        >>> parse_bin_str("0.1mb")
        100000

    Returns
    -------
    binning : int
        The number of basepair corresponding to the binning string.
    """
    try:
        binning = int(bin_str)
    except ValueError:
        bin_str = bin_str.upper()
        binsuffix = {"B": 1, "K": 1000, "M": 1e6, "G": 1e9}
        unit_pos = re.search(r"[KMG]?B[P]?$", bin_str).start()
        bp_unit = bin_str[unit_pos:]
        # Extract unit and multiply accordingly for fixed bp binning
        binning = int(float(bin_str[:unit_pos]) * binsuffix[bp_unit[0]])

    return binning


[docs]def parse_ucsc(ucsc_str, bins):
    """
    Take a UCSC region in UCSC notation and a list of bin chromosomes and
    positions (in basepair) and converts it to range of bins.

    Parameters
    ----------
    ucsc_str : str
        The region string in UCSC notation (e.g. chr1:1000-2000)
    bins : pandas.DataFrame
        Dataframe of two columns containing the chromosome and start
        position of each bin. Each row must be one bin.

    Returns
    -------
    coord : tuple
        A tuple containing the bin range containing in the requested region.
    """
    if ":" in ucsc_str:
        chrom, bp = ucsc_str.split(":")
        bp = bp.replace(",", "").upper()
        start, end = bp.split("-")
        start, end = parse_bin_str(start), parse_bin_str(end)
        # Make absolute bin index (independent of chrom)
        bins["id"] = bins.index
        chrombins = bins.loc[bins.iloc[:, 0] == chrom, :]
        start = max([start, 1])
        start = max(chrombins.id[chrombins.iloc[:, 1] < start])
        end = max(chrombins.id[chrombins.iloc[:, 1] < end])
    else:
        chrom = ucsc_str
        # Make absolute bin index (independent of chrom)
        bins[2] = bins.index
        chrombins = bins.loc[bins.iloc[:, 0] == chrom, :]
        try:
            start = min(chrombins[2])
            end = max(chrombins[2])
        except ValueError:
            logger.error("Invalid chromosome")
            raise
    coord = (int(start), int(end))
    return coord