Source code for hicstuff.filter

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Hi-C event filtering

Analyse the contents of a 3C library and filters spurious events such as loops
and uncuts to improve the overall signal. Filtering consists in excluding +-
and -+ Hi-C pairs if their reads are closer than a threshold in minimum number
of restriction fragments. This threshold represents the distance at which the
abundance of these events deviate significantly from the rest of the library.
It is estimated automatically by default using the median absolute deviation of
pairs at longer distances, but can also be entered manually.

The program takes a 2D BED file as input with the following fields:
chromA startA endA indiceA strandA chromB startB endB indiceB strandB
Each line of the file represents a Hi-C pair with reads A and B. The indices
are 0-based and represent the restriction fragment to which reads are
attributed.

@author: cmdoret (reimplementation of Axel KournaK's code)
"""

import sys
import numpy as np
from collections import OrderedDict
import matplotlib.pyplot as plt
from hicstuff.log import logger


[docs]def process_read_pair(line): r"""Process and order read pairs in a .pairs record. Takes a read pair (line) from a .pairs file as input, reorders the pair so that read 1 in intrachromosomal pairs always has the smallest genomic coordinate. Parameters ---------- line : str Record from a pairs file with columns: readID, chr1, pos1, chr2, pos2, strand1, strand2, frag1, frag2 Returns ------- dict Dictionary with reordered pair where each column from the line as an entry. The number of restriction sites separating reads in the pair and the type of event are also stored in the dictionary, under keys "nsites" and "type". Examples -------- >>> d = process_read_pair("readX\ta\t1\tb\t20\t-\t-\t1\t3") >>> for u in sorted(d.items()): ... print(u) ('chr1', 'a') ('chr2', 'b') ('frag1', 1) ('frag2', 3) ('nsites', 2) ('pos1', 1) ('pos2', 20) ('readID', 'readX') ('strand1', '-') ('strand2', '-') ('type', 'inter') >>> d = process_read_pair('readY\ta\t20\ta\t10\t-\t+\t2\t1') >>> [d[x] for x in sorted(d.keys())] ['a', 'a', 1, 2, 1, 10, 20, 'readY', '+', '-', '+-'] """ # Split line by whitespace p = line.split("\t") if len(p) != 9: raise ValueError( "Your input file does not have 9 columns. Make sure " "the file has the readID and twice the following 4 fields " "(once for each read in the pair): chr pos strand frag." ) # Saving each column as a dictionary entry with meaningful key cols = [ "readID", "chr1", "pos1", "chr2", "pos2", "strand1", "strand2", "frag1", "frag2", ] p = {cols[i]: p[i] for i in range(len(cols))} # Transforming numeric columns to int for col in ["pos1", "pos2", "frag1", "frag2"]: p[col] = int(p[col]) # invert records for intrachromosomal pairs where rec2 comes before # rec1 in genomic coordinates if p["chr1"] == p["chr2"] and p["pos2"] < p["pos1"]: p["strand1"], p["strand2"] = p["strand2"], p["strand1"] p["pos1"], p["pos2"] = p["pos2"], p["pos1"] p["frag1"], p["frag2"] = p["frag2"], p["frag1"] # Number of restriction sites separating reads in the pair p["nsites"] = abs(p["frag2"] - p["frag1"]) # Get event type if p["chr1"] == p["chr2"]: p["type"] = "".join([p["strand1"], p["strand2"]]) else: p["type"] = "inter" return p
[docs]def get_thresholds( in_dat, interactive=False, plot_events=False, fig_path=None, prefix=None ): """Guess distance threshold for event filtering Analyse the events in the first million of Hi-C pairs in the library, plot the occurrences of each event type according to number of restriction fragments, and ask user interactively for the minimum threshold for uncuts and loops. Parameters ---------- in_dat: str Path to the .pairs file containing Hi-C pairs. interactive: bool If True, plots are diplayed and thresholds are required interactively. plot_events : bool Whether to show the plot fig_path : str Path where the figure will be saved. If None, the figure will be diplayed interactively. prefix : str If the library has a name, it will be shown on plots. Returns ------- dictionary dictionary with keys "uncuts" and "loops" where the values are the corresponding thresholds entered by the user. """ thr_uncut = None thr_loop = None max_sites = 50 # Map of event -> legend name of event for intrachromosomal pairs. legend = { "++": "++ (weird)", "--": "-- (weird)", "+-": "+- (uncuts)", "-+": "-+ (loops)", } colors = {"++": "#222222", "+-": "r", "--": "#666666", "-+": "tab:orange"} n_events = {event: np.zeros(max_sites) for event in legend} i = 0 # open the file for reading (just the first 1 000 000 lines) with open(in_dat, "r") as pairs: for line in pairs: # Skip header lines if line.startswith("#"): continue i += 1 # Only use the first million pairs to estimate thresholds if i == 1000000: break # Process Hi-C pair into a dictionary p = process_read_pair(line) # Type of event and number of restriction site between reads etype = p["type"] nsites = p["nsites"] # Count number of events for intrachrom pairs if etype != "inter" and nsites < max_sites: n_events[etype][nsites] += 1 def plot_event(n_events, legend, name): """Plot the frequency of a given event types over distance.""" plt.xlim([-0.5, 15]) plt.plot( range(n_events[name].shape[0]), n_events[name], "o-", label=legend[name], linewidth=2.0, c=colors[name], ) if interactive: # PLot: try: plt.figure(0) for event in legend: plot_event(n_events, legend, event) plt.grid() plt.xlabel("Number of restriction fragment(s)") plt.ylabel("Number of events") plt.yscale("log") plt.legend() plt.show(block=False) except Exception: logger.error( "Unable to show plots, skipping figure generation. Perhaps " "there is no Xserver running ? (might be due to windows " "environment). Try running without the interactive option." ) # Asks the user for appropriate thresholds print( "Please enter the number of restriction fragments separating " "reads in a Hi-C pair below or at which loops and " "uncuts events will be excluded\n", file=sys.stderr, ) thr_uncut = int(input("Enter threshold for the uncuts events (+-):")) thr_loop = int(input("Enter threshold for the loops events (-+):")) try: plt.clf() except Exception: pass else: # Estimate thresholds from data for event in n_events: fixed = n_events[event] fixed[fixed == 0] = 1 n_events[event] = fixed all_events = np.log(np.array(list(n_events.values()))) # Compute median occurences at each restriction sites event_med = np.median(all_events, axis=0) # Compute MAD, to have a robust estimator of the expected deviation # from median at long distances mad = np.median(abs(all_events - event_med)) exp_stdev = mad / 0.67449 # Iterate over sites, from furthest to frag+2 for site in range(max_sites)[:1:-1]: # For uncuts and loops, keep the last (closest) site where the # deviation from other events <= expected_stdev if ( abs(np.log(n_events["+-"][site]) - event_med[site]) <= exp_stdev ): thr_uncut = site if ( abs(np.log(n_events["-+"][site]) - event_med[site]) <= exp_stdev ): thr_loop = site if thr_uncut is None or thr_loop is None: raise ValueError( "The threshold for loops or uncut could not be estimated. " "Please try running with -i to investigate the problem." ) logger.info( "Filtering with thresholds: uncuts={0} loops={1}".format( thr_uncut, thr_loop ) ) if plot_events: try: plt.figure(1) plt.xlim([-0.5, 15]) # Draw colored lines for events to discard plt.plot( range(0, thr_uncut + 1), n_events["+-"][: thr_uncut + 1], "o-", c=colors["+-"], label=legend["+-"], ) plt.plot( range(0, thr_loop + 1), n_events["-+"][: thr_loop + 1], "o-", c=colors["-+"], label=legend["-+"], ) plt.plot( range(0, 2), n_events["--"][:2], "o-", c=colors["--"], label=legend["--"], ) plt.plot( range(0, 2), n_events["++"][:2], "o-", c=colors["++"], label=legend["++"], ) # Draw black lines for events to keep plt.plot( range(thr_uncut, n_events["+-"].shape[0]), n_events["+-"][thr_uncut:], "o-", range(thr_loop, n_events["-+"].shape[0]), n_events["-+"][thr_loop:], "o-", range(1, n_events["--"].shape[0]), n_events["--"][1:], "o-", range(1, n_events["++"].shape[0]), n_events["++"][1:], "o-", label="kept", linewidth=2.0, c="g", ) plt.grid() plt.xlabel("Number of restriction site(s)") plt.ylabel("Number of events") plt.yscale("log") # Remove duplicate "kept" entries in legend handles, labels = plt.gca().get_legend_handles_labels() by_label = OrderedDict(zip(labels, handles)) plt.legend(by_label.values(), by_label.keys()) # Show uncut and loop threshold as vertical lines plt.axvline(x=thr_loop, color=colors["-+"]) plt.axvline(x=thr_uncut, color=colors["+-"]) if prefix: plt.title( "Library events by distance in {}".format(prefix) ) plt.tight_layout() if fig_path: plt.savefig(fig_path) else: plt.show(block=False) # plt.clf() except Exception: logger.error( "Unable to show plots, skipping figure generation. Is " "an X server running? (might be due to windows " "environment). Try running without the plot option." ) return thr_uncut, thr_loop
[docs]def filter_events( in_dat, out_filtered, thr_uncut, thr_loop, plot_events=False, fig_path=None, prefix=None, ): """Filter events (loops, uncuts and weirds) Filter out spurious intrachromosomal Hi-C pairs from input file. +- pairs with reads closer or at the uncut threshold and -+ pairs with reads closer or at the loop thresholds are excluded from the ouput file. -- and ++ pairs with both mates on the same fragments are also discarded. All others are written. Parameters ---------- in_dat : file object File handle in read mode to the 2D BED file containing Hi-C pairs. out_filtered : file object File handle in write mode the output filtered 2D BED file. thr_uncut : int Minimum number of restriction sites between reads to keep an intrachromosomal +- pair. thr_loop : int Minimum number of restriction sites between reads to keep an intrachromosomal -+ pair. plot_events : bool If True, a plot showing the proportion of each type of event will be shown after filtering. fig_path : str Path where the figure will be saved. If None, figure is displayed interactively. prefix : str If the library has a name, it will be shown on plots. """ n_uncuts = 0 n_loops = 0 n_weirds = 0 lrange_intra = 0 lrange_inter = 0 # open the files for reading and writing with open(in_dat, "r") as pairs, open(out_filtered, "w") as filtered: for line in pairs: # iterate over each line # Copy header lines to output if line.startswith("#"): filtered.write(line) continue p = process_read_pair(line) line_to_write = ( "\t".join( map( str, ( p["readID"], p["chr1"], p["pos1"], p["chr2"], p["pos2"], p["strand1"], p["strand2"], p["frag1"], p["frag2"], ), ) ) + "\n" ) if p["chr1"] == p["chr2"]: # Do not report ++ and -- pairs on the same fragment (impossible) if p["frag1"] == p["frag2"] and p["strand1"] == p["strand2"]: n_weirds += 1 elif p["nsites"] <= thr_loop and p["type"] == "-+": n_loops += 1 elif p["nsites"] <= thr_uncut and p["type"] == "+-": n_uncuts += 1 else: lrange_intra += 1 filtered.write(line_to_write) if p["chr1"] != p["chr2"]: lrange_inter += 1 filtered.write(line_to_write) if lrange_inter > 0: ratio_inter = round( 100 * lrange_inter / float(lrange_intra + lrange_inter), 2 ) else: ratio_inter = 0 # Log quick summary of operation results kept = lrange_intra + lrange_inter discarded = n_loops + n_uncuts + n_weirds total = kept + discarded logger.info( "Proportion of inter contacts: {0}% (intra: {1}, " "inter: {2})".format(ratio_inter, lrange_intra, lrange_inter) ) logger.info( "{0} pairs discarded: Loops: {1}, Uncuts: {2}, Weirds: {3}".format( discarded, n_loops, n_uncuts, n_weirds ) ) logger.info( "{0} pairs kept ({1}%)".format( kept, round(100 * kept / (kept + discarded), 2) ) ) # Visualize summary if requested by user if plot_events: try: # Plot: make a square figure and axes to plot a pieChart: plt.figure(2, figsize=(6, 6)) # The slices will be ordered and plotted counter-clockwise. fracs = [n_uncuts, n_loops, n_weirds, lrange_intra, lrange_inter] # Format labels to include event names and proportion labels = list( map( lambda x: (x[0] + ": %.2f%%") % (100 * x[1] / total), [ ("Uncuts", n_uncuts), ("Loops", n_loops), ("Weirds", n_weirds), ("3D intra", lrange_intra), ("3D inter", lrange_inter), ], ) ) colors = ["salmon", "lightskyblue", "yellow", "palegreen", "plum"] patches, _ = plt.pie(fracs, colors=colors, startangle=90) plt.legend( patches, labels, loc='upper left', bbox_to_anchor=(-0.1, 1.), ) if prefix: plt.title( "Distribution of library events in {}".format(prefix), bbox={"facecolor": "1.0", "pad": 5}, ) plt.text( 0.3, 1.15, "Threshold Uncuts = " + str(thr_uncut), fontdict=None, ) plt.text( 0.3, 1.05, "Threshold Loops = " + str(thr_loop), fontdict=None, ) plt.text( -1.5, -1.2, "Total number of reads = " + str(total), fontdict=None, ) plt.text( -1.5, -1.3, "Ratio inter/(intra+inter) = " + str(ratio_inter) + "%", fontdict=None, ) percentage = round( 100 * float(lrange_inter + lrange_intra) / (n_loops + n_uncuts + n_weirds + lrange_inter + lrange_intra) ) plt.text( -1.5, -1.4, "Selected reads = {0}%".format(percentage), fontdict=None, ) if fig_path: plt.savefig(fig_path) else: plt.show() plt.clf() except Exception: logger.error( "Unable to show plots. Perhaps there is no Xserver running ?" "(might be due to windows environment) skipping figure " "generation." )