[1]:
%load_ext autoreload
%autoreload 2
[2]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.ticker
import matplotlib.gridspec
import seaborn as sns

%matplotlib inline
plt.style.use('seaborn-poster')

import pandas as pd
import pairtools
import bioframe

Pairtools restrict walkthrough

The common approach to analyse Hi-C data is based to analyse the contacts of the restriction fragments. It is used in hiclib, Juicer, HiC-Pro.

Throughout this notebook, we will work with one of Rao et al. 2014 datasets for IMR90 cells [1].

[1] Rao, S. S., Huntley, M. H., Durand, N. C., Stamenova, E. K., Bochkov, I. D., Robinson, J. T., Sanborn, A. L., Machol, I., Omer, A. D., Lander, E. S., & Aiden, E. L. (2014). A 3D map of the human genome at kilobase resolution reveals principles of chromatin looping. Cell, 159(7), 1665–1680. https://doi.org/10.1016/j.cell.2014.11.021

Download the data from 4DN portal

To download the data from 4DN, you may need to register, get key and secret and write a spceialized curl command for your user:

[61]:
!curl -O -L --user RG6CSRMC:xlii3stnkphfygmu https://data.4dnucleome.org/files-processed/4DNFIW2BKSNF/@@download/4DNFIW2BKSNF.pairs.gz
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   330  100   330    0     0    931      0 --:--:-- --:--:-- --:--:--   932
100 3395M  100 3395M    0     0  29.7M      0  0:01:54  0:01:54 --:--:-- 33.1M 0:01:48  0:00:12  0:01:36 32.8M
[ ]:
%%bash
# Get total number of contacts to assess how many reads you can read in the future:
pairtools stats 4DNFIW2BKSNF.pairs.gz | head -n 1
# This will produce around 173 M pairs
[ ]:
%%bash
# Sample the fraction of pairs that will produce ~ 1 M of pairs:
pairtools sample 0.007 4DNFIW2BKSNF.pairs.gz -o 4DNFIW2BKSNF.pairs.sampled.gz

Annotate restriction fragments

[ ]:
%%bash
# Digest the genome into restriction fragments:
cooler digest ../tests_chromap/hg38/hg38.fa.sizes ../tests_chromap/hg38/hg38.fa MboI > hg38/hg38.MboI.restricted.bed
[ ]:
%%bash
# Annotate restriction fragments in the sampled file:
pairtools restrict -f hg38/hg38.MboI.restricted.bed 4DNFIW2BKSNF.pairs.sampled.gz -o 4DNFIW2BKSNF.pairs.sampled.restricted.gz

Read the pairs and analyse them as dataframe

[3]:
from pairtools.lib import headerops, fileio
[4]:
pairs_file = '4DNFIW2BKSNF.pairs.sampled.restricted.gz'
[5]:
pairs_stream = fileio.auto_open(pairs_file, 'r')
header, pairs_stream = headerops.get_header(pairs_stream)
columns = headerops.get_colnames(header)
[6]:
df = pd.read_table(pairs_stream, comment="#", header=None)
df.columns = columns
[ ]:

[7]:
df.loc[:, 'dist_rfrag1_left'] = df.pos1 - df.rfrag_start1
df.loc[:, 'dist_rfrag1_right'] = df.rfrag_end1 - df.pos1

df.loc[:, 'dist_rfrag2_left'] = df.pos2 - df.rfrag_start2
df.loc[:, 'dist_rfrag2_right'] = df.rfrag_end2 - df.pos2

Many of the 5’-ends of reads are mapped to the restriction sites:

[8]:
xmin = 0
xmax = 2000
step = 20

sns.distplot(df.query('strand1=="+"').dist_rfrag1_left, bins=np.arange(xmin, xmax, step), label='Distance from the 5\' read end to the nearest upstream rsite, + mapped reads')
sns.distplot(df.query('strand1=="+"').dist_rfrag1_right, bins=np.arange(xmin, xmax, step), label='Distance from the 5\' read end to the nearest downstream rsite, + mapped reads')

plt.xlim(xmin, xmax)
plt.legend()
plt.tight_layout()
../_images/examples_pairtools_restrict_walkthrough_18_0.png
[9]:
xmin = 0
xmax = 200
step = 1

sns.distplot(df.query('strand1=="+"').dist_rfrag1_left, bins=np.arange(xmin, xmax, step), label='Distance from the 5\' read end to the nearest upstream rsite, + mapped reads')
sns.distplot(df.query('strand1=="+"').dist_rfrag1_right, bins=np.arange(xmin, xmax, step), label='Distance from the 5\' read end to the nearest downstream rsite, + mapped reads')

plt.xlim(xmin, xmax)
plt.legend()
plt.tight_layout()
../_images/examples_pairtools_restrict_walkthrough_19_0.png

However, if we select only the pairs that map to the restriction sites, there is no significant skew in scaling:

[10]:
hg38_chromsizes = bioframe.fetch_chromsizes('hg38',
                                            as_bed=True)
hg38_cens = bioframe.fetch_centromeres('hg38')
hg38_arms = bioframe.make_chromarms(hg38_chromsizes,
                                    dict(hg38_cens.set_index('chrom').mid),
                                    cols_chroms=('chrom', 'start', 'end') )

# To fix pandas bug in some versions:
hg38_arms['start'] = hg38_arms['start'].astype(int)
hg38_arms['end'] = hg38_arms['end'].astype(int)
[11]:
import pairtools.lib.scaling as scaling
[12]:
def plot(cis_scalings, n, xlim=(1e1,1e9), label='' ):
    strand_gb = cis_scalings.groupby(['strand1', 'strand2'])
    for strands in ['+-', '-+', '++', '--']:
        sc_strand = strand_gb.get_group(tuple(strands))
        sc_agg = (sc_strand
            .groupby(['min_dist','max_dist'])
            .agg({'n_pairs':'sum', 'n_bp2':'sum'})
            .reset_index())

        dist_bin_mids = np.sqrt(sc_agg.min_dist * sc_agg.max_dist)
        pair_frequencies = sc_agg.n_pairs / sc_agg.n_bp2
        pair_frequencies = pair_frequencies/cis_scalings.n_pairs.sum()
        mask = pair_frequencies>0
        label_long = f'{strands[0]}{strands[1]} {label}'

        if np.sum(mask)>0:
            plt.loglog(
                dist_bin_mids[mask],
                pair_frequencies[mask],
                label=label_long,
                lw=2
            )

    plt.gca().xaxis.set_major_locator(matplotlib.ticker.LogLocator(base=10.0,numticks=20))
    plt.gca().yaxis.set_major_locator(matplotlib.ticker.LogLocator(base=10.0,numticks=20))
    plt.gca().set_aspect(1.0)
    plt.xlim(xlim)

    plt.grid(lw=0.5,color='gray')
    plt.legend(loc=(1.1,0.4))
    plt.ylabel('contact frequency, \nHi-C molecule per bp pair normalized by total')
    plt.xlabel('distance, bp')

    plt.tight_layout()
[13]:
# Get the pairs where R1 is far enough from site of restriction, but not too far
df_subset = df.query("(strand1=='+' and dist_rfrag1_left>5 and dist_rfrag1_left<=250)")
n_distant = len(df_subset)
cis_scalings_distant, trans_levels_distant = scaling.compute_scaling(
                df_subset,
                regions=hg38_arms,
                chromsizes=hg38_arms,
                dist_range=(10, 1e9),
                n_dist_bins=128,
                chunksize=int(1e7),
            )
plot(cis_scalings_distant, n_distant, label="pairs, 5' distant from rsite")


# Get the pairs where R1 is too far enough from site of restriction
df_subset = df.query("(strand1=='+' and dist_rfrag1_left>550)")
n_toodistant = len(df_subset)
cis_scalings_toodistant, trans_levels_toodistant = scaling.compute_scaling(
                df_subset,
                regions=hg38_arms,
                chromsizes=hg38_arms,
                dist_range=(10, 1e9),
                n_dist_bins=128,
                chunksize=int(1e7),
            )
plot(cis_scalings_toodistant, n_toodistant, label="pairs, 5' too far from rsite")


# Get the pairs where R1 is very close to the site of restriction
df_subset = df.query("(strand1=='+' and dist_rfrag1_left<5)")
n_tooclose = len(df_subset)
cis_scalings_tooclose, trans_levels_tooclose = scaling.compute_scaling(
                df_subset,
                regions=hg38_arms,
                chromsizes=hg38_arms,
                dist_range=(10, 1e9),
                n_dist_bins=128,
                chunksize=int(1e7),
            )
plot(cis_scalings_tooclose, n_tooclose, label="pairs, 5' close to rsite")
# Try another replicate of replicate, maybe the last one
../_images/examples_pairtools_restrict_walkthrough_24_0.png

How many pairs we take if not strictly filtering by dangling ends and self-circles?

[14]:
df.loc[:, "type_rfrag"] = "Regular pair"

mask_neighboring_rfrags = (np.abs(df.rfrag1-df.rfrag2)<=1)

mask_DE = (df.strand1=="+") & (df.strand2=="-") & mask_neighboring_rfrags
df.loc[mask_DE, "type_rfrag"] = "DanglingEnd"

mask_SS = (df.strand1=="-") & (df.strand2=="+") & mask_neighboring_rfrags
df.loc[mask_SS, "type_rfrag"] = "SelfCircle"

mask_Err = (df.strand1==df.strand2) & mask_neighboring_rfrags
df.loc[mask_Err, "type_rfrag"] = "Mirror"
[15]:
df.sort_values("type_rfrag").groupby("type_rfrag").count()['readID']
[15]:
type_rfrag
DanglingEnd       76902
Mirror             3214
Regular pair    1132002
SelfCircle         3036
Name: readID, dtype: int64
[16]:
# Full scaling

n = len(df)
cis_scalings, trans_levels = scaling.compute_scaling(
                df,
                regions=hg38_arms,
                chromsizes=hg38_arms,
                dist_range=(10, 1e9),
                n_dist_bins=128,
                chunksize=int(1e7),
            )
plot(cis_scalings, n, label="pairs")

# The point where the scalings by distance become balanced:
plt.axvline(2e3, ls='--', c='gray', label='Balancing point')

plt.savefig("./oriented_scalings.pdf")
../_images/examples_pairtools_restrict_walkthrough_28_0.png
[17]:
df.loc[:, "type_bydist"] = "Regular pair"

mask_ondiagonal = (np.abs(df.pos2-df.pos1)<=2e3)

mask_DE = (df.strand1=="+") & (df.strand2=="-") & mask_ondiagonal
df.loc[mask_DE, "type_bydist"] = "DanglingEnd"

mask_SS = (df.strand1=="-") & (df.strand2=="+") & mask_ondiagonal
df.loc[mask_SS, "type_bydist"] = "SelfCircle"

mask_Err = (df.strand1==df.strand2) & mask_ondiagonal
df.loc[mask_Err, "type_bydist"] = "Mirror"
[18]:
df.sort_values("type_bydist").groupby("type_bydist").count()['readID']
[18]:
type_bydist
DanglingEnd      135381
Mirror            18383
Regular pair    1053213
SelfCircle         8177
Name: readID, dtype: int64
[19]:
df.sort_values(["type_rfrag", "type_bydist"])\
    .groupby(["type_rfrag", "type_bydist"])\
    .count()[['readID']]\
    .reset_index()\
    .pivot(columns="type_bydist", index="type_rfrag")\
    .fillna(0).astype(int)
[19]:
readID
type_bydist DanglingEnd Mirror Regular pair SelfCircle
type_rfrag
DanglingEnd 76898 0 4 0
Mirror 0 3176 38 0
Regular pair 58483 15207 1052994 5318
SelfCircle 0 0 177 2859

False Positives are in 3rd row, False Negatives are in 3rd column. Filtering by distance is, thus, nearly as effective as filtering by restriction fragment, but removes additional pairs that can be potential undercut by restriction enzyme.

Removing all contacts closer than 2 Kb will remove Hi-C artifacts.