Source code for hicstuff.io

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import gzip
import zipfile
import bz2
import io
import os
import functools
import sys
import numpy as np
import pandas as pd
import collections
import subprocess as sp
from scipy.sparse import tril, triu
import pathlib
import re
from os.path import join, exists
from random import getrandbits
from scipy.sparse import coo_matrix
from Bio import SeqIO, SeqUtils
import hicstuff.hicstuff as hcs
from hicstuff.log import logger
from hicstuff.version import __version__
import scipy.stats as ss

DEFAULT_MAX_MATRIX_SHAPE = 10000

DEFAULT_FRAGMENTS_LIST_FILE_NAME = "fragments_list.txt"
DEFAULT_INFO_CONTIGS_FILE_NAME = "info_contigs.txt"
DEFAULT_SPARSE_MATRIX_FILE_NAME = "abs_fragments_contacts_weighted.txt"


def _cols_to_sparse(sparse_array, shape=None, dtype=np.float64):
    """
    Make a coordinate based sparse matrix from columns.
    Convert an (n, 3) shaped array to a sparse matrix. The first
    column acts as the row coordinates, the second as the
    column coordinates and the third as the data points
    for each pair. If duplicates are found, the data points are
    added.

    Parameters
    ----------
    sparse_array : array_like
        An array with exactly three columns representing the sparse
        matrix data in coordinate format.
    shape : tuple of int
        The total number of rows and columns in the matrix. Will be estimated
        from nonzero values if omitted.
    dtype : type, optional
        The type of data being loaded. Default is numpy.float64

    Example
    -------

        >>> import numpy as np
        >>> row, col = np.array([1, 2, 3]), np.array([3, 2, 1])
        >>> data = np.array([4, 5, 6])
        >>> M = np.array([row, col, data]).T
        >>> S = _cols_to_sparse(M)
        >>> print(S.todense())
        [[0. 0. 0. 0.]
         [0. 0. 0. 4.]
         [0. 0. 5. 0.]
         [0. 6. 0. 0.]]
    """

    row = sparse_array[:, 0]
    col = sparse_array[:, 1]
    data = sparse_array[:, 2]
    if shape is None:
        n = int(max(np.amax(row), np.amax(col))) + 1
        shape = (n, n)

    S = coo_matrix((data, (row, col)), shape=shape, dtype=dtype)
    return S


def load_sparse_matrix(mat_path, binning=1, dtype=np.float64):
    """Load sparse matrix

    Load a text file matrix into a sparse matrix object. The expected format
    is a 3 column file where columns are row_number, col_number, value. The
    first line consists of 3 values representing the total number of rows,
    columns and nonzero values.

    Parameters
    ----------
    mat_path : file, str or pathlib.Path
        The input matrix file in instagraal format.
    binning : int or "auto"
        The binning to perform. If "auto", binning will be automatically
        inferred so that the matrix size will not go beyond (10000, 10000)
        in shape. That can be changed by modifying the
        DEFAULT_MAX_MATRIX_SHAPE value. Default is 1, i.e. no binning is
        performed
    dtype : type, optional
        The type of data being loaded. Default is numpy.float64

    Returns
    -------
    sparse_mat : scipy.sparse.coo_matrix
        The output (sparse) matrix in COOrdinate format.

    Examples
    --------
    >>> S = load_sparse_matrix('test_data/mat_5kb.tsv', binning=1)
    >>> S.data[:10]
    array([84.,  2.,  3.,  2.,  1.,  1., 50.,  1., 66.,  1.])
    >>> S.shape
    (16, 16)
    """
    try:
        cols_arr = np.loadtxt(mat_path, delimiter="\t", dtype=dtype)
        shape = (int(cols_arr[0, 0]), int(cols_arr[0, 1]))
    except ValueError:
        cols_arr = np.loadtxt(
            mat_path, delimiter="\t", dtype=dtype, skiprows=1
        )
        shape = None

    # Get values into an array without the header. Use the header to give size.
    sparse_mat = _cols_to_sparse(cols_arr[1:, :], shape=shape, dtype=dtype)

    if binning == "auto":
        num_bins = max(sparse_mat.shape) + 1
        subsampling_factor = num_bins // DEFAULT_MAX_MATRIX_SHAPE
    else:
        subsampling_factor = binning
    sparse_mat = hcs.bin_sparse(
        sparse_mat, subsampling_factor=subsampling_factor
    )
    return sparse_mat

def save_sparse_matrix(s_mat, path):
    """Save a sparse matrix

    Saves a sparse matrix object into tsv format.

    Parameters
    ----------
    s_mat : scipy.sparse.coo_matrix
        The sparse matrix to save on disk
    path : str
        File path where the matrix will be stored
    """
    if s_mat.format != "coo":
        raise ValueError("Sparse matrix must be in coo format")
    dtype = s_mat.dtype
    fmt = "%i" if dtype == int else "%.10e"
    sparse_arr = np.vstack([s_mat.row, s_mat.col, s_mat.data]).T

    np.savetxt(
        path,
        sparse_arr,
        header="{nrows}\t{ncols}\t{nonzero}".format(
            nrows=s_mat.shape[0], ncols=s_mat.shape[1], nonzero=s_mat.nnz
        ),
        comments="",
        fmt=fmt,
        delimiter="\t",
    )

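# Usage sketch: save_sparse_matrix and load_sparse_matrix round-trip a COO
# matrix through the graal text format. The helper below is purely
# illustrative and not part of the hicstuff API; the temporary path and the
# tiny demo matrix are placeholders.
def _example_sparse_roundtrip():
    """Sketch: write a tiny COO matrix to graal format and read it back."""
    import tempfile

    demo = coo_matrix(
        (np.array([2.0, 5.0]), ([0, 1], [1, 2])), shape=(3, 3), dtype=np.float64
    )
    with tempfile.NamedTemporaryFile(suffix=".tsv", delete=False) as tmp:
        path = tmp.name
    save_sparse_matrix(demo, path)
    # Reload without binning; shape and values should match the input
    return load_sparse_matrix(path, binning=1)
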
def load_pos_col(path, colnum, header=1, dtype=np.int64):
    """
    Loads a single column of a TSV file with header into a numpy array.

    Parameters
    ----------
    path : str
        The path of the TSV file to load.
    colnum : int
        The 0-based index of the column to load.
    header : int
        Number of lines to skip. By default the header is a single line.

    Returns
    -------
    numpy.array :
        A 1D numpy array with the values of the selected column.

    Examples
    --------
    >>> load_pos_col('test_data/mat_5kb.tsv', 0)[:10]
    array([0, 0, 0, 0, 0, 0, 1, 1, 2, 2])
    """
    pos_arr = np.genfromtxt(
        path,
        delimiter="\t",
        usecols=(colnum,),
        skip_header=header,
        dtype=dtype,
    )
    return pos_arr

def generate_temp_dir(path):
    """Temporary directory generation

    Generates a temporary directory with a random name at the input path.

    Parameters
    ----------
    path : str
        The path at which the temporary directory will be created.

    Returns
    -------
    str
        The path of the newly created temporary directory.
    """
    exist = True
    while exist:
        # Keep trying random directory names if they already exist
        directory = str(hex(getrandbits(32)))[2:]
        full_path = join(path, directory)
        if not exists(full_path):
            exist = False
    try:
        os.makedirs(full_path)
    except PermissionError:
        raise PermissionError(
            "The temporary directory cannot be created in {}. "
            "Make sure you have write permission.".format(path)
        )

    return full_path

def _check_cooler(fun):
    """Decorates function `fun` to check if cooler is available."""

    @functools.wraps(fun)
    def wrapped(*args, **kwargs):
        try:
            import cooler

            fun.__globals__["cooler"] = cooler
        except ImportError:
            logger.error(
                "The cooler package is required to use {0}, "
                "please install it first".format(fun.__name__)
            )
            raise ImportError("The cooler package is required.")
        return fun(*args, **kwargs)

    return wrapped

@_check_cooler
def add_cool_column(
    clr, column, column_name, table_name="bins", metadata={}, dtype=None
):
    """
    Adds a new column to a loaded Cooler store. If the column exists, it is
    replaced. This will affect the .cool file.

    Parameters
    ----------
    clr : Cooler object
        A Cooler store.
    column : pandas Series
        The column to add to the cooler.
    column_name : str
        The name of the column to add.
    table_name : str
        The name of the table to which the column should be added.
        Defaults to the "bins" table.
    metadata : dict
        A dictionary of metadata to associate with the new column.
    """
    with clr.open("r+") as c:
        if column_name in c[table_name]:
            del c[table_name][column_name]
        h5opts = dict(compression="gzip", compression_opts=6)
        c[table_name].create_dataset(
            column_name, data=column, dtype=dtype, **h5opts
        )
        c[table_name][column_name].attrs.update(metadata)

@_check_cooler
def load_cool(cool):
    """
    Reads a cool file into memory and parses it into graal style tables.

    Parameters
    ----------
    cool : str
        Path to the input .cool file.

    Returns
    -------
    mat : scipy coo_matrix
        Hi-C contact map in COO format.
    frags : pandas DataFrame
        Table of bins matching the matrix. Corresponds to the content of the
        fragments_list.txt file.
    chroms : pandas DataFrame
        Table of chromosome information.
    """
    c = cooler.Cooler(cool)  # pylint: disable=undefined-variable
    frags = c.bins()[:]
    chroms = c.chroms()[:]
    mat = c.pixels()[:]
    frags.rename(
        columns={"start": "start_pos", "end": "end_pos"}, inplace=True
    )
    frags["id"] = frags.groupby("chrom", sort=False).cumcount() + 1
    # Try loading hicstuff-specific columns
    try:
        frags = frags[
            ["id", "chrom", "start_pos", "end_pos", "size", "gc_content"]
        ]
    # If absent, only load standard columns
    except KeyError:
        frags = frags[["id", "chrom", "start_pos", "end_pos"]]
    chroms["cumul_length"] = (
        chroms.length.shift(1).fillna(0).cumsum().astype(int)
    )
    n_frags = c.bins()[:].groupby("chrom", sort=False).count().start
    chroms["n_frags"] = chroms.merge(
        n_frags, right_index=True, left_on="name", how="left"
    ).start
    chroms.rename(columns={"name": "contig"}, inplace=True)
    n = int(max(np.amax(mat.bin1_id), np.amax(mat.bin2_id))) + 1
    shape = (n, n)
    mat = coo_matrix((mat["count"], (mat.bin1_id, mat.bin2_id)), shape=shape)
    return mat, frags, chroms

@_check_cooler
def save_cool(cool_out, mat, frags, metadata={}):
    """
    Writes a .cool file from graal style tables.

    Parameters
    ----------
    cool_out : str
        Path to the output cool file.
    mat : scipy coo_matrix
        The Hi-C contact matrix in sparse COO format.
    frags : pandas DataFrame
        The graal style 'fragments_list' table.
    metadata : dict
        Potential metadata to associate with the cool file.
    """
    up_tri = False
    # Check whether the matrix is symmetric or stores only the upper triangle
    if (abs(mat - mat.T) > 1e-10).nnz != 0:
        up_tri = True
    # Drop useless column
    try:
        bins = frags.drop("id", axis=1)
    except KeyError:
        bins = frags
    # Get column names right
    bins.rename(
        columns={"seq": "chrom", "start_pos": "start", "end_pos": "end"},
        inplace=True,
    )
    mat_dict = {"bin1_id": mat.row, "bin2_id": mat.col, "count": mat.data}
    pixels = pd.DataFrame(mat_dict)
    cooler.create_cooler(  # pylint: disable=undefined-variable
        cool_out,
        bins,
        pixels,
        metadata=metadata,
        symmetric_upper=up_tri,
        triucheck=False,
    )

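# Usage sketch: converting a graal-style matrix plus fragment table to a
# .cool file by chaining the loaders above. The helper and file paths are
# illustrative placeholders, not part of the hicstuff API; it requires the
# cooler package.
def _example_graal_to_cool(mat_file, frags_file, cool_out):
    """Sketch: read a graal COO matrix and fragments table, write a .cool."""
    mat = load_sparse_matrix(mat_file)
    frags = pd.read_csv(frags_file, sep="\t")
    # save_cool renames seq/start_pos/end_pos columns to cooler's
    # chrom/start/end internally
    save_cool(cool_out, mat, frags, metadata={"hicstuff": __version__})
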
def read_compressed(filename, mode="r"):
    """Read compressed file

    Opens the file in read mode with appropriate decompression algorithm.

    Parameters
    ----------
    filename : str
        The path to the input file

    Returns
    -------
    file-like object
        The handle to access the input file's content
    """
    # Standard header bytes for diff compression formats
    comp_bytes = {
        b"\x1f\x8b\x08": "gz",
        b"\x42\x5a\x68": "bz2",
        b"\x50\x4b\x03\x04": "zip",
    }

    max_len = max(len(x) for x in comp_bytes)

    def file_type(filename):
        """Guess file type

        Compare header bytes with those in the file and return type.
        """
        with open(filename, "rb") as f:
            file_start = f.read(max_len)
        for magic, filetype in comp_bytes.items():
            if file_start.startswith(magic):
                return filetype
        return "uncompressed"

    # Open file with appropriate function
    mode_map = {"r": "rt", "rb": "rb"}
    comp = file_type(filename)
    if comp == "gz":
        return gzip.open(filename, mode_map[mode])
    elif comp == "bz2":
        return bz2.open(filename, mode_map[mode])
    elif comp == "zip":
        zip_arch = zipfile.ZipFile(filename, mode)
        if len(zip_arch.namelist()) > 1:
            raise IOError(
                "Only a single fastq file must be in the zip archive."
            )
        else:
            # ZipFile opens as bytes by default, using io to read as text
            zip_content = zip_arch.open(zip_arch.namelist()[0], mode)
            return io.TextIOWrapper(zip_content, encoding="utf-8")
    else:
        return open(filename, mode)

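# Usage sketch: read_compressed can be used as a drop-in replacement for
# open() when the input text/fastq file may be gzip, bz2 or zip compressed.
# The helper below is illustrative only and the path is a placeholder.
def _example_count_lines(fastq_path):
    """Sketch: count lines of a possibly-compressed text file."""
    n_lines = 0
    with read_compressed(fastq_path) as handle:
        for _ in handle:
            n_lines += 1
    return n_lines
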
def is_compressed(filename):
    """Check compression status

    Check if the input file is compressed from the first bytes.

    Parameters
    ----------
    filename : str
        The path to the input file

    Returns
    -------
    bool
        True if the file is compressed, False otherwise.
    """
    # Standard header bytes for diff compression formats
    comp_bytes = {
        b"\x1f\x8b\x08": "gz",
        b"\x42\x5a\x68": "bz2",
        b"\x50\x4b\x03\x04": "zip",
    }

    max_len = max(len(x) for x in comp_bytes)

    with open(filename, "rb") as f:
        file_start = f.read(max_len)
    for magic, _ in comp_bytes.items():
        if file_start.startswith(magic):
            return True
    return False

def from_dade_matrix(filename, header=False):
    """Load a DADE matrix

    Load a numpy array from a DADE matrix file, optionally returning bin
    information from the header. Header data processing is delegated
    downstream.

    Parameters
    ----------
    filename : str, file or pathlib.Path
        The name of the file containing the DADE matrix.
    header : bool
        Whether to also return the information contained in the header.
        Default is False.

    Example
    -------
        >>> import numpy as np
        >>> import tempfile
        >>> lines = [['RST', 'chr1~0', 'chr1~10', 'chr2~0', 'chr2~30'],
        ...          ['chr1~0', '5'],
        ...          ['chr1~10', '8', '3'],
        ...          ['chr2~0', '3', '5', '5'],
        ...          ['chr2~30', '5', '10', '11', '2']
        ...          ]
        >>> formatted = ["\\t".join(l) + "\\n" for l in lines ]
        >>> dade = tempfile.NamedTemporaryFile(mode='w')
        >>> for fm in formatted:
        ...     dade.write(fm)
        34
        9
        12
        13
        18
        >>> dade.flush()
        >>> M, h = from_dade_matrix(dade.name, header=True)
        >>> dade.close()
        >>> print(M)
        [[ 5.  8.  3.  5.]
         [ 8.  3.  5. 10.]
         [ 3.  5.  5. 11.]
         [ 5. 10. 11.  2.]]
        >>> print(h)
        ['chr1~0', 'chr1~10', 'chr2~0', 'chr2~30']

    See https://github.com/scovit/DADE for more details about Dade.
    """
    A = pd.read_csv(filename, sep="\t", header=None)
    A.fillna("0", inplace=True)
    M, headers = np.array(A.iloc[1:, 1:], dtype=np.float64), A.iloc[0, :]
    matrix = M + M.T - np.diag(np.diag(M))
    if header:
        return matrix, headers.tolist()[1:]
    else:
        return matrix

def to_dade_matrix(M, annotations="", filename=None):
    """Returns a Dade matrix from input numpy matrix. Any annotations are
    added as header. If filename is provided and valid, said matrix is also
    saved as text.
    """
    n, m = M.shape
    A = np.zeros((n + 1, m + 1))
    A[1:, 1:] = M
    if not annotations:
        annotations = np.array(["" for _ in range(n)], dtype=str)
    A[0, :] = annotations
    A[:, 0] = annotations.T
    if filename:
        try:
            np.savetxt(filename, A, fmt="%i")
            logger.info(
                "I saved input matrix in dade format as {0}".format(
                    str(filename)
                )
            )
        except ValueError as e:
            logger.warning("I couldn't save input matrix.")
            logger.warning(str(e))

    return A

def load_into_redis(filename):
    """Load a file into redis

    Load a matrix file and store it in memory with redis. Useful to pass
    around huge datasets from script to script and load them only once.

    Inspired from https://gist.github.com/alexland/ce02d6ae5c8b63413843

    Parameters
    ----------
    filename : str, file or pathlib.Path
        The file of the matrix to load.

    Returns
    -------
    key : str
        The key of the dataset needed to retrieve it from redis.
    """
    try:
        from redis import StrictRedis as redis
        import time
    except ImportError:
        print(
            "Error! Redis does not appear to be installed in your system.",
            file=sys.stderr,
        )
        exit(1)

    M = np.genfromtxt(filename, dtype=None)
    array_dtype = str(M.dtype)
    m, n = M.shape
    M = M.ravel().tostring()
    database = redis(host="localhost", port=6379, db=0)
    key = "{0}|{1}#{2}#{3}".format(int(time.time()), array_dtype, m, n)

    database.set(key, M)
    return key

def load_from_redis(key):
    """Retrieve a dataset from redis

    Retrieve a cached dataset that was stored in redis with the input key.

    Parameters
    ----------
    key : str
        The key of the dataset that was stored in redis.

    Returns
    -------
    M : numpy.ndarray
        The retrieved dataset in array format.
    """
    try:
        from redis import StrictRedis as redis
    except ImportError:
        print(
            "Error! Redis does not appear to be installed in your system.",
            file=sys.stderr,
        )
        exit(1)

    database = redis(host="localhost", port=6379, db=0)

    try:
        M = database.get(key)
    except KeyError:
        print(
            "Error! No dataset was found with the supplied key.",
            file=sys.stderr,
        )
        exit(1)

    array_dtype, n, m = key.split("|")[1].split("#")
    M = np.fromstring(M, dtype=array_dtype).reshape(int(n), int(m))
    return M

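# Usage sketch: the two redis helpers are meant to be used as a pair. The key
# returned by load_into_redis encodes the dtype and shape that
# load_from_redis needs. The helper below is illustrative only; it assumes a
# redis server listening on localhost:6379 and a placeholder matrix file.
def _example_redis_roundtrip(matrix_file):
    """Sketch: cache a dense matrix in redis and fetch it back by key."""
    key = load_into_redis(matrix_file)
    # ...the key alone can now be passed to another script...
    return load_from_redis(key)
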
def dade_to_graal(
    filename,
    output_matrix=DEFAULT_SPARSE_MATRIX_FILE_NAME,
    output_contigs=DEFAULT_INFO_CONTIGS_FILE_NAME,
    output_frags=DEFAULT_FRAGMENTS_LIST_FILE_NAME,
    output_dir=None,
):
    """Convert a matrix from DADE format (https://github.com/scovit/dade)
    to a graal-compatible format. Since DADE matrices contain both fragment
    and contact information all files are generated at the same time.
    """
    with open(output_matrix, "w") as sparse_file:
        sparse_file.write("id_frag_a\tid_frag_b\tn_contact\n")
        with open(filename) as file_handle:
            first_line = file_handle.readline()
            for row_index, line in enumerate(file_handle):
                dense_row = np.array(line.split("\t")[1:], dtype=np.int32)
                for col_index in np.nonzero(dense_row)[0]:
                    line_to_write = "{}\t{}\t{}\n".format(
                        row_index, col_index, dense_row[col_index]
                    )
                    sparse_file.write(line_to_write)

    header = first_line.split("\t")
    bin_type = header[0]
    if bin_type == '"RST"':
        logger.info("I detected fragment-wise binning")
    elif bin_type == '"BIN"':
        logger.info("I detected fixed size binning")
    else:
        logger.warning(
            (
                "Sorry, I don't understand this matrix's "
                "binning: I read {}".format(str(bin_type))
            )
        )

    header_data = [
        header_elt.replace("'", "")
        .replace('"', "")
        .replace("\n", "")
        .split("~")
        for header_elt in header[1:]
    ]

    (
        global_frag_ids,
        contig_names,
        local_frag_ids,
        frag_starts,
        frag_ends,
    ) = np.array(list(zip(*header_data)))

    frag_starts = frag_starts.astype(np.int32) - 1
    frag_ends = frag_ends.astype(np.int32) - 1
    frag_lengths = frag_ends - frag_starts

    total_length = len(global_frag_ids)

    with open(output_contigs, "w") as info_contigs:
        info_contigs.write("contig\tlength\tn_frags\tcumul_length\n")
        cumul_length = 0
        for contig in collections.OrderedDict.fromkeys(contig_names):
            length_tig = np.sum(frag_lengths[contig_names == contig])
            n_frags = collections.Counter(contig_names)[contig]
            line_to_write = "%s\t%s\t%s\t%s\n" % (
                contig,
                length_tig,
                n_frags,
                cumul_length,
            )
            info_contigs.write(line_to_write)
            cumul_length += n_frags

    with open(output_frags, "w") as fragments_list:
        fragments_list.write(
            "id\tchrom\tstart_pos\tend_pos" "\tsize\tgc_content\n"
        )
        bogus_gc = 0.5
        for i in range(total_length):
            line_to_write = "%s\t%s\t%s\t%s\t%s\t%s\n" % (
                int(local_frag_ids[i]) + 1,
                contig_names[i],
                frag_starts[i],
                frag_ends[i],
                frag_lengths[i],
                bogus_gc,
            )
            fragments_list.write(line_to_write)

def load_bedgraph2d(filename, bin_size=None, fragments_file=None):
    """
    Loads matrix and fragment information from a 2D bedgraph file. Note this
    function assumes chromosomes are ordered in alphabetical order.

    Parameters
    ----------
    filename : str
        Path to the bedgraph2D file.
    bin_size : int
        The size of bins in the case of fixed bin size.
    fragments_file : str
        Path to a fragments file to explicitly provide fragments positions.
        If the matrix does not have fixed bin size, this prevents errors.

    Returns
    -------
    mat : scipy.sparse.coo_matrix
        The Hi-C contact map as the upper triangle of a symmetric matrix, in
        sparse format.
    frags : pandas.DataFrame
        The list of fragments/bins present in the matrix with their genomic
        positions.
    chroms : pandas.DataFrame
        Table of chromosome information derived from the fragments table.
    """
    bed2d = pd.read_csv(filename, sep="\t", header=None)
    chrom_sizes = {}
    if bin_size is not None:
        # If bin size is provided, retrieve chromosome lengths, this will be
        # used when regenerating bin coordinates
        chroms_left = bed2d[[3, 5]]
        chroms_left.columns = [0, 2]
        chroms = (
            pd.concat([bed2d[[0, 2]], chroms_left])
            .groupby([0], sort=False)
            .max()
        )
        for chrom, size in zip(chroms.index, np.array(chroms)):
            chrom_sizes[chrom] = size[0]
    elif fragments_file is None:
        logger.warning(
            "Please be aware that not all information can be restored from a "
            "bg2 file without fixed bin size; fragments without any contact "
            "will be lost"
        )

    # Get all possible fragment chrom-positions into an array
    frag_pos = np.vstack(
        [np.array(bed2d[[0, 1, 2]]), np.array(bed2d[[3, 4, 5]])]
    )
    # Sort by position (least important, col 1)
    frag_pos = frag_pos[frag_pos[:, 1].argsort(kind="mergesort")]
    # Then by chrom (most important, col 0)
    frag_pos = frag_pos[frag_pos[:, 0].argsort(kind="mergesort")]
    # Get unique names for fragments (chrom+pos)
    ordered_frag_pos = (
        pd.DataFrame(frag_pos).drop_duplicates().reset_index(drop=True)
    )
    frag_pos_a = bed2d[[0, 1]].apply(lambda x: tuple(x), axis=1)
    frag_pos_b = bed2d[[3, 4]].apply(lambda x: tuple(x), axis=1)
    # If fragments file is provided, use fragments positions to indices mapping
    if fragments_file is not None:
        frags = pd.read_csv(fragments_file, delimiter="\t")
        frag_map = frags.apply(lambda x: (str(x.chrom), x.start_pos), axis=1)
        frag_map = {f_name: f_idx for f_idx, f_name in enumerate(frag_map)}
    # If fixed fragment size available, use it to reconstruct original
    # fragments ID (even if they are absent from the bedgraph file).
    elif bin_size is not None:
        frag_map = {}
        chrom_frags = []
        for chrom, size in chrom_sizes.items():
            prev_frags = len(frag_map)
            for bin_id, bin_pos in enumerate(range(0, size, bin_size)):
                frag_map[(chrom, bin_pos)] = bin_id + prev_frags
            n_bins = size // bin_size
            chrom_frags.append(
                pd.DataFrame(
                    {
                        "id": range(1, n_bins + 1),
                        "chrom": np.repeat(chrom, n_bins),
                        "start_pos": range(0, size, bin_size),
                        "end_pos": range(bin_size, size + bin_size, bin_size),
                    }
                )
            )
        frags = pd.concat(chrom_frags, axis=0).reset_index(drop=True)
        frags.insert(
            loc=3, column="size", value=frags.end_pos - frags.start_pos
        )
    # If None available, guess fragments indices from bedgraph (potentially wrong)
    else:
        frag_map = {
            (v[0], v[1]): i
            for i, v in ordered_frag_pos.iloc[:, [0, 1]].iterrows()
        }
        frags = ordered_frag_pos.copy()
        frags[3] = frags.iloc[:, 2] - frags.iloc[:, 1]
        frags.insert(loc=0, column="id", value=0)
        frags.id = frags.groupby([0], sort=False).cumcount() + 1
        frags.columns = ["id", "chrom", "start_pos", "end_pos", "size"]

    # Match bin indices to their names
    frag_id_a = np.array(list(map(lambda x: frag_map[x], frag_pos_a)))
    frag_id_b = np.array(list(map(lambda x: frag_map[x], frag_pos_b)))
    contacts = np.array(bed2d.iloc[:, 6].tolist())
    # Use index to build matrix
    n_frags = len(frag_map.keys())
    mat = coo_matrix(
        (contacts, (frag_id_a, frag_id_b)), shape=(n_frags, n_frags)
    )

    # Get size of each chromosome in basepairs
    chromsizes = frags.groupby("chrom", sort=False).apply(
        lambda x: np.int64(max(x.end_pos))
    )
    chrom_bins = frags.groupby("chrom", sort=False).size()
    # Shift chromsizes by one to get starting bin, first one is zero
    # Make chromsize cumulative to get start bin of each chrom
    # Get chroms into a 1D array of bin starts
    chrom_start = chrom_bins.shift(1, fill_value=0).cumsum()
    chroms = pd.DataFrame(
        {
            "contig": chromsizes.index,
            "length": chromsizes.values,
            "n_frags": chrom_bins,
            "cumul_length": chrom_start,
        }
    )

    return mat, frags, chroms

def flexible_hic_loader(
    mat, fragments_file=None, chroms_file=None, quiet=False
):
    """
    Wrapper function to load a COO, bg2 or cool input and return the same
    output. The COO format requires the fragments_file and chroms_file
    options. The bg2 format can infer bin_size if it is fixed. When providing
    a bg2 matrix with uneven fragment lengths, one should also provide
    fragments_file, or empty bins will be truncated from the output.

    Parameters
    ----------
    mat : str
        Path to the matrix in graal, bedgraph2 or cool format.
    fragments_file : str or None
        Path to the file with fragments information (fragments_list.txt).
        Only required if the matrix is in graal format.
    chroms_file : str or None
        Path to the file with chromosome information (info_contigs.txt).
        Only required if the matrix is in graal format.
    quiet : bool
        If True, will silence warnings for empty outputs.

    Returns
    -------
    mat : scipy.sparse.coo_matrix
        Sparse upper triangle Hi-C matrix.
    frags : pandas.DataFrame or None
        Table of fragment information. None if information was not provided.
    chroms : pandas.DataFrame or None
        Table of chromosome/contig information. None if information was not
        provided.
    """
    hic_format = get_hic_format(mat)
    # Load cool based on file extension
    if hic_format == "cool":
        mat, frags, chroms = load_cool(mat)
    # Use the first line to determine COO / bg2 format
    if hic_format == "bg2":
        # Use the frags file to define bins if available
        if fragments_file is not None:
            mat, frags, chroms = load_bedgraph2d(
                mat, fragments_file=fragments_file
            )
        else:
            # Guess if bin size is fixed based on MAD
            bg2 = pd.read_csv(mat, sep="\t")
            sizes = np.array(bg2.iloc[:, 2] - bg2.iloc[:, 1])
            size_mad = ss.median_abs_deviation(sizes, scale="normal")
            # Use only the bg2
            if size_mad > 0:
                mat, frags, chroms = load_bedgraph2d(mat)
                logger.warning(
                    "Input is a bedgraph2d file with uneven bin size, "
                    "but no fragments_file was provided. Empty bins will "
                    "be missing from the output. To avoid this, provide a "
                    "fragments file."
                )
            # Use fixed bin size
            else:
                mat, frags, chroms = load_bedgraph2d(
                    mat, bin_size=int(np.median(sizes))
                )
    elif hic_format == "graal":
        mat = load_sparse_matrix(mat)
        try:
            frags = pd.read_csv(fragments_file, sep="\t")
        except ValueError:
            if not quiet:
                logger.warning(
                    "fragments_file was not provided when loading a matrix "
                    "in COO/graal format. frags will be None."
                )
            frags = None
        try:
            chroms = pd.read_csv(chroms_file, sep="\t")
        except ValueError:
            if not quiet:
                logger.warning(
                    "chroms_file was not provided when loading a matrix "
                    "in COO/graal format. chroms will be None."
                )
            chroms = None

    # Ensure the matrix is upper triangle symmetric
    if mat.shape[0] == mat.shape[1]:
        if (abs(mat - mat.T) > 1e-10).nnz > 0:
            mat = mat + tril(mat, k=-1).T
            mat = triu(mat, format="coo")

    return mat, frags, chroms

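# Usage sketch: flexible_hic_loader hides the format detection, so the same
# call works for graal, bg2 or cool inputs. The helper below is illustrative
# only; the path is a placeholder and frags/chroms may be None for graal
# input when the companion files are not given.
def _example_flexible_load(mat_path):
    """Sketch: load any supported matrix format and report its shape."""
    mat, frags, chroms = flexible_hic_loader(mat_path, quiet=True)
    logger.info("Loaded a %d x %d matrix", mat.shape[0], mat.shape[1])
    return mat, frags, chroms
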
def get_hic_format(mat):
    """Returns the format of the input Hi-C matrix

    Parameters
    ----------
    mat : str
        Path to the input Hi-C matrix

    Returns
    -------
    str :
        Hi-C format string. One of graal, bg2, cool
    """
    if (
        mat.endswith(".cool")
        or mat.count(".mcool::/") == 1
        or mat.count(".cool::/") == 1
    ):
        hic_format = "cool"
    else:
        # Use the first line to determine COO / bg2 format
        ncols = len(open(mat).readline().split("\t"))
        if ncols == 7:
            hic_format = "bg2"
        elif ncols == 3:
            hic_format = "graal"
        else:
            raise ValueError("Unknown file format")

    return hic_format

def flexible_hic_saver(
    mat, out_prefix, frags=None, chroms=None, hic_fmt="graal", quiet=False
):
    """
    Saves objects to the desired Hi-C file format.

    Parameters
    ----------
    mat : scipy.sparse.coo_matrix
        Hi-C contact map.
    out_prefix : str
        Output path without extension (the extension is added based on
        hic_fmt).
    frags : pandas.DataFrame or None
        Table of fragment information.
    chroms : pandas.DataFrame or None
        Table of chromosome / contig information.
    hic_fmt : str
        Output format. Can be one of graal for graal-compatible COO format,
        bg2 for 2D bedgraph format, or cool for cooler compatible format.
    """
    if hic_fmt == "graal":
        save_sparse_matrix(mat, out_prefix + ".mat.tsv")
        try:
            frags.to_csv(out_prefix + ".frag.tsv", sep="\t", index=False)
        except AttributeError:
            if not quiet:
                logger.warning(
                    "Could not create fragments_list.txt from input files"
                )
        try:
            chroms.to_csv(out_prefix + ".chr.tsv", sep="\t", index=False)
        except AttributeError:
            if not quiet:
                logger.warning(
                    "Could not create info_contigs.txt from input files"
                )
    elif hic_fmt == "cool":
        frag_sizes = frags.end_pos - frags.start_pos
        size_mad = np.median(abs(frag_sizes - np.median(frag_sizes)))
        bin_type = "variable" if size_mad else "fixed"
        try:
            save_cool(
                out_prefix + ".cool",
                mat,
                frags,
                metadata={"hicstuff": __version__, "bin-type": bin_type},
            )
        except NameError:
            raise NameError("frags is required to save a cool file")
    elif hic_fmt == "bg2":
        try:
            save_bedgraph2d(mat, frags, out_prefix + ".bg2")
        except NameError:
            raise NameError("frags is required to save a bg2 file")
    else:
        raise ValueError("Unknown output format: {0}".format(hic_fmt))

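# Usage sketch: chaining the flexible loader and saver gives a minimal format
# converter (e.g. bg2 or cool to graal). The helper and both paths below are
# illustrative placeholders, not part of the hicstuff API.
def _example_convert(mat_in, out_prefix, out_fmt="graal"):
    """Sketch: convert a Hi-C matrix between supported formats."""
    mat, frags, chroms = flexible_hic_loader(mat_in, quiet=True)
    flexible_hic_saver(
        mat, out_prefix, frags=frags, chroms=chroms, hic_fmt=out_fmt
    )
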
def save_bedgraph2d(mat, frags, out_path):
    """
    Given a sparse matrix and a corresponding list of fragments, save a file
    in 2D bedgraph format.

    Parameters
    ----------
    mat : scipy.sparse.coo_matrix
        The sparse contact map.
    frags : pandas.DataFrame
        A structure containing the annotations for each matrix bin. Should
        correspond to the content of the fragments_list.txt file.
    out_path : str
        Path to the output bedgraph2d (bg2) file.
    """
    mat_df = pd.DataFrame(
        {"row": mat.row, "col": mat.col, "data": mat.data.astype(int)}
    )
    # Merge fragments with matrix based on row indices to annotate rows
    merge_mat = mat_df.merge(
        frags, left_on="row", right_index=True, how="left", suffixes=("", "")
    )
    # Rename annotations to assign to frag1
    merge_mat.rename(
        columns={"chrom": "chr1", "start_pos": "start1", "end_pos": "end1"},
        inplace=True,
    )
    # Do the same operation for cols (frag2)
    merge_mat = merge_mat.merge(
        frags,
        left_on="col",
        right_index=True,
        how="left",
        suffixes=("_0", "_2"),
    )
    merge_mat.rename(
        columns={"chrom": "chr2", "start_pos": "start2", "end_pos": "end2"},
        inplace=True,
    )
    # Select only relevant columns in correct order
    bg2 = merge_mat.loc[
        :, ["chr1", "start1", "end1", "chr2", "start2", "end2", "data"]
    ]
    bg2.to_csv(out_path, header=None, index=False, sep="\t")

def get_pos_cols(df):
    """Get column names representing chromosome, start and end column from
    a dataframe. Allows flexible names.

    Parameters
    ----------
    df : pd.DataFrame

    Returns
    -------
    tuple of str:
        Tuple containing the names of the chromosome, start and end columns
        in the input dataframe.

    Examples
    --------
    >>> import pandas as pd
    >>> d = [1, 2, 3]
    >>> df = pd.DataFrame(
    ...     {'Chromosome': d, 'Start': d, 'End': d, 'species': d}
    ... )
    >>> get_pos_cols(df)
    ('Chromosome', 'Start', 'End')
    >>> df = pd.DataFrame(
    ...     {'id': d, 'chr': d, 'start_bp': d, 'end_bp': d}
    ... )
    >>> get_pos_cols(df)
    ('chr', 'start_bp', 'end_bp')
    """
    # Get case insensitive column names
    cnames = df.columns
    inames = cnames.str.lower()

    def _col_getter(cols, pat):
        """Helper to get column index from the start of its name"""
        mask = cols.str.startswith(pat)
        idx = np.flatnonzero(mask)[0]
        return idx

    chrom_col = cnames[_col_getter(inames, "chr")]
    start_col = cnames[_col_getter(inames, "start")]
    end_col = cnames[_col_getter(inames, "end")]
    return chrom_col, start_col, end_col

def gc_bins(genome_path, frags):
    """Generate GC content annotation for bins using input genome.

    Parameters
    ----------
    genome_path : str
        Path to the genome file in FASTA format.
    frags : pandas.DataFrame
        Table containing bin segmentation of the genome. Required columns:
        chrom, start, end.

    Returns
    -------
    numpy.ndarray of floats:
        GC content per bin, in the range [0.0, 1.0].
    """
    # Grab columns by name (catch start, Start, start_pos, etc)
    chrom_col, start_col, end_col = get_pos_cols(frags)
    # Fill the gc array by chromosome
    gc_bins = np.zeros(frags.shape[0], dtype=float)
    for rec in SeqIO.parse(genome_path, "fasta"):
        mask = frags[chrom_col] == rec.id
        # Define coordinates of each bin
        starts = frags.loc[mask, start_col]
        ends = frags.loc[mask, end_col]
        # Slice chromosome sequence based on bins
        seqs = [str(rec.seq)[s:e] for s, e in zip(starts, ends)]
        # Fill GC values for bins in the chromosome
        idx = np.flatnonzero(mask)
        gc_bins[idx] = np.array(list(map(SeqUtils.GC, seqs))) / 100.0
    return gc_bins

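# Usage sketch: gc_bins returns one value per row of the fragment table, so
# it can be attached directly as a new column. The helper below is
# illustrative only; the genome path is a placeholder.
def _example_annotate_gc(genome_fa, frags):
    """Sketch: add a gc_content column to a fragments table."""
    frags = frags.copy()
    frags["gc_content"] = gc_bins(genome_fa, frags)
    return frags
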
def sort_pairs(in_file, out_file, keys, tmp_dir=None, threads=1, buffer="2G"):
    """
    Sort a pairs file in batches using UNIX sort.

    Parameters
    ----------
    in_file : str
        Path to the unsorted input file.
    out_file : str
        Path to the sorted output file.
    keys : list of str
        list of columns to use as sort keys. Each column can be one of
        readID, chr1, pos1, chr2, pos2, frag1, frag2. Key priorities are
        according to the order in the list.
    tmp_dir : str
        Path to the directory where temporary files will be created. Defaults
        to current directory.
    threads : int
        Number of parallel sorting threads.
    buffer : str
        Buffer size used for sorting. Consists of a number and a unit.
    """
    # TODO: Write a pure python implementation to drop GNU coreutils
    # dependency, could be inspired from:
    # https://stackoverflow.com/q/14465154/8440675

    # Check if UNIX sort version supports parallelism
    parallel_ok = True
    sort_ver = sp.Popen(["sort", "--version"], stdout=sp.PIPE)
    sort_ver = (
        sort_ver.communicate()[0]
        .decode()
        .split("\n")[0]
        .split(" ")[-1]
        .split(".")
    )
    # If so, specify threads, otherwise don't mention it in the command line
    try:
        sort_ver = list(map(int, sort_ver))
        if sort_ver[0] < 8 or (sort_ver[0] == 8 and sort_ver[1] < 23):
            logger.warning(
                "GNU sort version is {0} but >8.23 is required for parallel "
                "sort. Sorting on a single thread.".format(
                    ".".join(map(str, sort_ver))
                )
            )
            parallel_ok = False
    # BSD sort has a different format and will throw error upon parsing. It
    # does not support parallel processes anyway.
    except ValueError:
        logger.warning(
            "Using BSD sort instead of GNU sort, sorting on a single thread."
        )
        parallel_ok = False

    key_map = {
        "readID": "-k1,1d",
        "chr1": "-k2,2V",
        "pos1": "-k3,3n",
        "chr2": "-k4,4V",
        "pos2": "-k5,5n",
        "strand1": "-k6,6d",
        "strand2": "-k7,7d",
        "frag1": "-k8,8n",
        "frag2": "-k9,9n",
    }

    # transform column names to corresponding sort keys
    try:
        sort_keys = map(lambda k: key_map[k], keys)
    except KeyError:
        print("Unknown column name.")
        raise

    # Rewrite header with new sorting order
    header = get_pairs_header(in_file)
    with open(out_file, "w") as output:
        for line in header:
            if line.startswith("#sorted"):
                output.write("#sorted: {0}\n".format("-".join(keys)))
            else:
                output.write(line + "\n")

    # Sort pairs and append to file.
    with open(out_file, "a") as output:
        grep_proc = sp.Popen(["grep", "-v", "^#", in_file], stdout=sp.PIPE)
        sort_cmd = ["sort", "-S %s" % buffer] + list(sort_keys)
        if tmp_dir is not None:
            sort_cmd.append("--temporary-directory={0}".format(tmp_dir))
        if parallel_ok:
            sort_cmd.append("--parallel={0}".format(threads))
        sort_proc = sp.Popen(sort_cmd, stdin=grep_proc.stdout, stdout=output)
        sort_proc.communicate()

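# Usage sketch: a typical call sorts a .pairs file by coordinates
# (chr1, pos1, chr2, pos2). The helper below is illustrative only; paths are
# placeholders and a sort binary must be available in PATH.
def _example_sort_pairs(pairs_in, pairs_out, tmp_dir=None):
    """Sketch: coordinate-sort a pairs file with 2 threads and a 2G buffer."""
    sort_pairs(
        pairs_in,
        pairs_out,
        keys=["chr1", "pos1", "chr2", "pos2"],
        tmp_dir=tmp_dir,
        threads=2,
        buffer="2G",
    )
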
def get_pairs_header(pairs):
    r"""Retrieves the header of a .pairs file and stores lines into a list.

    Parameters
    ----------
    pairs : str or file object
        Path to the pairs file.

    Returns
    -------
    header : list of str
        A list of header lines found, in the same order they appear in pairs.

    Examples
    --------
    >>> import os
    >>> from tempfile import NamedTemporaryFile
    >>> p = NamedTemporaryFile('w', delete=False)
    >>> p.writelines(["## pairs format v1.0\n", "#sorted: chr1-chr2\n", "abcd\n"])
    >>> p.close()
    >>> h = get_pairs_header(p.name)
    >>> for line in h:
    ...     print([line])
    ['## pairs format v1.0']
    ['#sorted: chr1-chr2']
    >>> os.unlink(p.name)
    """
    # Open file if needed
    with open(pairs, "r") as pairs:
        # Store header lines into a list
        header = []
        line = pairs.readline()
        while line.startswith("#"):
            header.append(line.rstrip())
            line = pairs.readline()

    return header

def reorder_fasta(genome, output=None, threshold=100000):
    """Reorder and trim a fasta file

    Sort a fasta file by record lengths, optionally trimming the smallest
    ones.

    Parameters
    ----------
    genome : str, file or pathlib.Path
        The genome scaffold file (or file handle)
    output : str, file or pathlib.Path
        The output file to write to
    threshold : int, optional
        The size below which scaffolds are discarded, by default 100000
    """
    if output is None:
        output = "{}_renamed.fa".format(genome.split(".")[0])

    handle = SeqIO.parse(genome, "fasta")
    # Keep records above the size threshold and sort them by length,
    # largest first
    handle_to_write = sorted(
        (u for u in handle if len(u) > threshold), key=len, reverse=True
    )
    SeqIO.write(handle_to_write, output, "fasta")

def rename_genome(genome, output=None, ambiguous=True):
    """Rename genome and slugify headers

    Rename genomes according to a simple naming scheme; this is mainly done
    to avoid special character weirdness.

    Parameters
    ----------
    genome : file, str or pathlib.Path
        The input genome to be renamed and slugified.
    output : file, str or pathlib.Path
        The output genome to be written into. Default is <base>_renamed.fa,
        where <base> is genome_in without its extension.
    ambiguous : bool
        Whether to retain ambiguous non-N bases, otherwise rename them to Ns.
        Default is True.
    """
    if output is None:
        output = "{}_renamed.fa".format(genome.split(".")[0])

    with open(output, "w") as output_handle:
        for record in SeqIO.parse(genome, "fasta"):
            # Replace hyphens, tabs and whitespace with underscores
            new_record_id = record.id.replace(" ", "_")
            new_record_id = new_record_id.replace("-", "_")
            new_record_id = new_record_id.replace("\t", "_")
            # Remove anything that's weird, i.e. not alphanumeric
            # or an underscore
            new_record_id = re.sub("[^_A-Za-z0-9]+", "", new_record_id)
            header = ">{}\n".format(new_record_id)

            new_seq = re.sub("[^ATGCatgcNn]", "N", str(record.seq))

            output_handle.write(header)
            output_handle.write("{}\n".format(new_seq))

def check_fasta_index(ref, mode="bowtie2"):
    """
    Checks for the existence of a bowtie2 or bwa index based on the reference
    file name.

    Parameters
    ----------
    ref : str
        Path to the reference genome.
    mode : str
        The alignment software used to build the index. bowtie2 or bwa. If
        any other value is given, the function returns the reference path.

    Returns
    -------
    index : str
        The bowtie2 or bwa index basename. None if no index was found
    """
    ref = pathlib.Path(ref)
    if mode == "bowtie2":
        # Bowtie2 should have 6 index files
        bt2_idx_files = list(ref.parent.glob("{}*bt2*".format(ref.name)))
        index = None if len(bt2_idx_files) < 6 else bt2_idx_files
    elif mode == "bwa":
        refdir = str(ref.parent)
        refdir_files = os.listdir(refdir)
        bwa_idx_files = [
            join(refdir, f)
            for f in refdir_files
            if re.search(r".*\.(sa|pac|bwt|ann|amb)$", f)
        ]
        index = None if len(bwa_idx_files) < 5 else bwa_idx_files
    else:
        index = [ref]
    if index is not None:
        # Convert the PosixPath objects to strings and get the longest
        # common prefix to obtain index basename (without the dot)
        index = os.path.commonprefix(list(map(str, index))).strip(".")
    return index

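# Usage sketch: check_fasta_index returns the index basename (or None), which
# lets a caller decide whether an index must be built before aligning. The
# helper below is illustrative only; the reference path is a placeholder and
# invoking bowtie2-build here is just one possible way to build the index.
def _example_ensure_bt2_index(ref_fasta):
    """Sketch: return an existing bowtie2 index basename, building one if absent."""
    index = check_fasta_index(ref_fasta, mode="bowtie2")
    if index is None:
        index = ref_fasta
        sp.run(["bowtie2-build", ref_fasta, index], check=True)
    return index
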
def check_is_fasta(in_file):
    """
    Checks whether input file is in fasta format.

    Parameters
    ----------
    in_file : str
        Path to the input file.

    Returns
    -------
    bool :
        True if the input is in fasta format, False otherwise
    """
    try:
        with read_compressed(in_file) as handle:
            fasta = any(SeqIO.parse(handle, "fasta"))
    except FileNotFoundError:
        fasta = False

    return fasta