Source code for hicstuff.view

#!/usr/bin/env python3
# coding: utf-8

"""Hi-C visualization

A lightweight library for quickly parsing, loading and
viewing contact maps in instagraal or csv format.
"""


import numpy as np
from matplotlib import pyplot as plt
from matplotlib import colors
from Bio import SeqIO
import hicstuff.io as hio
from hicstuff.hicstuff import normalize_sparse

SEABORN = False

try:
    import seaborn as sns

    SEABORN = True
except ImportError:
    pass



load_sparse_matrix = hio.load_sparse_matrix


[docs]def sparse_to_dense(M, remove_diag=True): """ Converts a sparse square matrix into a full dense matrix. Removes the diagonal by default. Parameters ---------- M : scipy.sparse.coo_matrix A sparse representation of the matrix. remove_diag : bool Whether the diagonal Returns ------- numpy.array : The matrix in dense format. Example ------- >>> import numpy as np >>> from scipy.sparse import coo_matrix >>> row, col = np.array([1, 2, 3]), np.array([3, 2, 1]) >>> data = np.array([4, 5, 6]) >>> S = coo_matrix((data, (row, col))) >>> M = sparse_to_dense(S, remove_diag=True) >>> for u in M: ... print(u) [0 0 0 0] [ 0 0 0 10] [0 0 0 0] [ 0 10 0 0] """ sub_diag = 2 if remove_diag else 1 D = M.toarray() E = D + np.transpose(D) - sub_diag * np.diag(np.diag(D)) return np.array(E)
[docs]def plot_matrix( array, ax=None, filename=None, vmin=0, vmax=None, title=None, dpi=500, cmap="Reds", chrom_starts=None ): """A function that performs all the tedious matplotlib magic to draw a 2D array with as few parameters and as little whitespace as possible. Adjusted from https://github.com/koszullab/metaTOR Parameters ---------- array : numpy.array The input (dense) matrix that must be plotted. ax : matplotlib.AxesSubplot An optional axis on which to draw the plot. Useful for multipanel images. filename : str The filename where the image should be stored. If None, the image is shown interactively. vmin : int The minimum value on the colorscale. vmax : int or None The maximum value on the colorscale. If None, the 99th percentile is taken. title : str The string to display as figure title. dpi : int The DPI (i.e. resolution in dots per inch) of the output image. cmap : str The name of the matplotlib colormap to use when plotting the matrix. chrom_starts : numpy.array Array of bin positions where to draw chromosome starts as dotted lines. """ if vmax is None: vmax = np.percentile(array, 99) # plt.gca().set_axis_off() # plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0) # plt.margins(0, 0) # plt.gca().xaxis.set_major_locator(plt.NullLocator()) # plt.gca().yaxis.set_major_locator(plt.NullLocator()) im_kwargs = {'vmin': vmin, 'vmax': vmax, 'cmap': cmap, 'interpolation': "none"} li_kwargs = {'ls': ':', 'alpha': 0.5, 'c': 'black'} if ax is None: plt.figure() plt.imshow(array, **im_kwargs) plt.colorbar() plt.axis("off") else: ax.imshow(array, **im_kwargs) if chrom_starts is not None: targ = plt if ax is None else ax for pos in chrom_starts: targ.axvline(pos, **li_kwargs) targ.axhline(pos, **li_kwargs) if title is not None: if ax is None: plt.title(title) else: ax.set_title(title) if filename: plt.savefig(filename, bbox_inches="tight", pad_inches=0.0, dpi=dpi) else: if ax is None: plt.show()
[docs]def normalize(M, norm="SCN"): """Attempt to normalize if hicstuff is found, does nothing otherwise. """ try: return normalize_sparse(M, norm=norm) except NameError: return M
[docs]def scaffold_distribution(genome, threshold=100, plot=True): """Visualize scaffold size distribution Compute (and optionally display) scaffold size distribution for a genome in fasta format. Parameters ---------- genome : str, file or pathlib.Path The genome scaffold file (or file handle) threshold : int, optional The size below which scaffolds are discarded, by default 1000000 plot : bool, optional Whether to plot the results Returns ------- list : The list of scaffold sizes in decreasing order. """ handle = SeqIO.parse(genome, "fasta") lengths = sorted( (len(u) for u in handle if len(u) > threshold), reverse=True ) if plot: x, y = zip(*enumerate(lengths)) plt.scatter(x=x, y=y) plt.show() return lengths
[docs]def reorder_fasta(genome, output, threshold=100000): """Reorder and trim a fasta file Sort a fasta file by record lengths, optionally trimming the smallest ones. Parameters ---------- genome : str, file or pathlib.Path The genome scaffold file (or file handle) output : str, file or pathlib.Path The output file to write to threshold : int, optional The size below which scaffolds are discarded, by default 100000 """ handle = SeqIO.parse(genome, "fasta") handle_to_write = sorted( (len(u) for u in handle if len(u) > threshold), reverse=True ) SeqIO.write(handle_to_write, output, "fasta")