[1]:
%load_ext autoreload
%autoreload 2
[2]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker
import matplotlib.gridspec
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn-poster')
import pandas as pd
import pairtools
import bioframe
Pairtools restrict walkthrough
The common approach to analyse Hi-C data is based to analyse the contacts of the restriction fragments. It is used in hiclib, Juicer, HiC-Pro.
Throughout this notebook, we will work with one of Rao et al. 2014 datasets for IMR90 cells [1].
[1] Rao, S. S., Huntley, M. H., Durand, N. C., Stamenova, E. K., Bochkov, I. D., Robinson, J. T., Sanborn, A. L., Machol, I., Omer, A. D., Lander, E. S., & Aiden, E. L. (2014). A 3D map of the human genome at kilobase resolution reveals principles of chromatin looping. Cell, 159(7), 1665–1680. https://doi.org/10.1016/j.cell.2014.11.021
Download the data from 4DN portal
To download the data from 4DN, you may need to register, get key and secret and write a spceialized curl command for your user:
[61]:
!curl -O -L --user RG6CSRMC:xlii3stnkphfygmu https://data.4dnucleome.org/files-processed/4DNFIW2BKSNF/@@download/4DNFIW2BKSNF.pairs.gz
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
100 330 100 330 0 0 931 0 --:--:-- --:--:-- --:--:-- 932
100 3395M 100 3395M 0 0 29.7M 0 0:01:54 0:01:54 --:--:-- 33.1M 0:01:48 0:00:12 0:01:36 32.8M
[ ]:
%%bash
# Get total number of contacts to assess how many reads you can read in the future:
pairtools stats 4DNFIW2BKSNF.pairs.gz | head -n 1
# This will produce around 173 M pairs
[ ]:
%%bash
# Sample the fraction of pairs that will produce ~ 1 M of pairs:
pairtools sample 0.007 4DNFIW2BKSNF.pairs.gz -o 4DNFIW2BKSNF.pairs.sampled.gz
Annotate restriction fragments
[ ]:
%%bash
# Digest the genome into restriction fragments:
cooler digest ../tests_chromap/hg38/hg38.fa.sizes ../tests_chromap/hg38/hg38.fa MboI > hg38/hg38.MboI.restricted.bed
[ ]:
%%bash
# Annotate restriction fragments in the sampled file:
pairtools restrict -f hg38/hg38.MboI.restricted.bed 4DNFIW2BKSNF.pairs.sampled.gz -o 4DNFIW2BKSNF.pairs.sampled.restricted.gz
Read the pairs and analyse them as dataframe
[3]:
from pairtools.lib import headerops, fileio
[4]:
pairs_file = '4DNFIW2BKSNF.pairs.sampled.restricted.gz'
[5]:
pairs_stream = fileio.auto_open(pairs_file, 'r')
header, pairs_stream = headerops.get_header(pairs_stream)
columns = headerops.get_colnames(header)
[6]:
df = pd.read_table(pairs_stream, comment="#", header=None)
df.columns = columns
[ ]:
[7]:
df.loc[:, 'dist_rfrag1_left'] = df.pos1 - df.rfrag_start1
df.loc[:, 'dist_rfrag1_right'] = df.rfrag_end1 - df.pos1
df.loc[:, 'dist_rfrag2_left'] = df.pos2 - df.rfrag_start2
df.loc[:, 'dist_rfrag2_right'] = df.rfrag_end2 - df.pos2
Many of the 5’-ends of reads are mapped to the restriction sites:
[8]:
xmin = 0
xmax = 2000
step = 20
sns.distplot(df.query('strand1=="+"').dist_rfrag1_left, bins=np.arange(xmin, xmax, step), label='Distance from the 5\' read end to the nearest upstream rsite, + mapped reads')
sns.distplot(df.query('strand1=="+"').dist_rfrag1_right, bins=np.arange(xmin, xmax, step), label='Distance from the 5\' read end to the nearest downstream rsite, + mapped reads')
plt.xlim(xmin, xmax)
plt.legend()
plt.tight_layout()
[9]:
xmin = 0
xmax = 200
step = 1
sns.distplot(df.query('strand1=="+"').dist_rfrag1_left, bins=np.arange(xmin, xmax, step), label='Distance from the 5\' read end to the nearest upstream rsite, + mapped reads')
sns.distplot(df.query('strand1=="+"').dist_rfrag1_right, bins=np.arange(xmin, xmax, step), label='Distance from the 5\' read end to the nearest downstream rsite, + mapped reads')
plt.xlim(xmin, xmax)
plt.legend()
plt.tight_layout()
However, if we select only the pairs that map to the restriction sites, there is no significant skew in scaling:
[10]:
hg38_chromsizes = bioframe.fetch_chromsizes('hg38',
as_bed=True)
hg38_cens = bioframe.fetch_centromeres('hg38')
hg38_arms = bioframe.make_chromarms(hg38_chromsizes,
dict(hg38_cens.set_index('chrom').mid),
cols_chroms=('chrom', 'start', 'end') )
# To fix pandas bug in some versions:
hg38_arms['start'] = hg38_arms['start'].astype(int)
hg38_arms['end'] = hg38_arms['end'].astype(int)
[11]:
import pairtools.lib.scaling as scaling
[12]:
def plot(cis_scalings, n, xlim=(1e1,1e9), label='' ):
strand_gb = cis_scalings.groupby(['strand1', 'strand2'])
for strands in ['+-', '-+', '++', '--']:
sc_strand = strand_gb.get_group(tuple(strands))
sc_agg = (sc_strand
.groupby(['min_dist','max_dist'])
.agg({'n_pairs':'sum', 'n_bp2':'sum'})
.reset_index())
dist_bin_mids = np.sqrt(sc_agg.min_dist * sc_agg.max_dist)
pair_frequencies = sc_agg.n_pairs / sc_agg.n_bp2
pair_frequencies = pair_frequencies/cis_scalings.n_pairs.sum()
mask = pair_frequencies>0
label_long = f'{strands[0]}{strands[1]} {label}'
if np.sum(mask)>0:
plt.loglog(
dist_bin_mids[mask],
pair_frequencies[mask],
label=label_long,
lw=2
)
plt.gca().xaxis.set_major_locator(matplotlib.ticker.LogLocator(base=10.0,numticks=20))
plt.gca().yaxis.set_major_locator(matplotlib.ticker.LogLocator(base=10.0,numticks=20))
plt.gca().set_aspect(1.0)
plt.xlim(xlim)
plt.grid(lw=0.5,color='gray')
plt.legend(loc=(1.1,0.4))
plt.ylabel('contact frequency, \nHi-C molecule per bp pair normalized by total')
plt.xlabel('distance, bp')
plt.tight_layout()
[13]:
# Get the pairs where R1 is far enough from site of restriction, but not too far
df_subset = df.query("(strand1=='+' and dist_rfrag1_left>5 and dist_rfrag1_left<=250)")
n_distant = len(df_subset)
cis_scalings_distant, trans_levels_distant = scaling.compute_scaling(
df_subset,
regions=hg38_arms,
chromsizes=hg38_arms,
dist_range=(10, 1e9),
n_dist_bins=128,
chunksize=int(1e7),
)
plot(cis_scalings_distant, n_distant, label="pairs, 5' distant from rsite")
# Get the pairs where R1 is too far enough from site of restriction
df_subset = df.query("(strand1=='+' and dist_rfrag1_left>550)")
n_toodistant = len(df_subset)
cis_scalings_toodistant, trans_levels_toodistant = scaling.compute_scaling(
df_subset,
regions=hg38_arms,
chromsizes=hg38_arms,
dist_range=(10, 1e9),
n_dist_bins=128,
chunksize=int(1e7),
)
plot(cis_scalings_toodistant, n_toodistant, label="pairs, 5' too far from rsite")
# Get the pairs where R1 is very close to the site of restriction
df_subset = df.query("(strand1=='+' and dist_rfrag1_left<5)")
n_tooclose = len(df_subset)
cis_scalings_tooclose, trans_levels_tooclose = scaling.compute_scaling(
df_subset,
regions=hg38_arms,
chromsizes=hg38_arms,
dist_range=(10, 1e9),
n_dist_bins=128,
chunksize=int(1e7),
)
plot(cis_scalings_tooclose, n_tooclose, label="pairs, 5' close to rsite")
# Try another replicate of replicate, maybe the last one
How many pairs we take if not strictly filtering by dangling ends and self-circles?
[14]:
df.loc[:, "type_rfrag"] = "Regular pair"
mask_neighboring_rfrags = (np.abs(df.rfrag1-df.rfrag2)<=1)
mask_DE = (df.strand1=="+") & (df.strand2=="-") & mask_neighboring_rfrags
df.loc[mask_DE, "type_rfrag"] = "DanglingEnd"
mask_SS = (df.strand1=="-") & (df.strand2=="+") & mask_neighboring_rfrags
df.loc[mask_SS, "type_rfrag"] = "SelfCircle"
mask_Err = (df.strand1==df.strand2) & mask_neighboring_rfrags
df.loc[mask_Err, "type_rfrag"] = "Mirror"
[15]:
df.sort_values("type_rfrag").groupby("type_rfrag").count()['readID']
[15]:
type_rfrag
DanglingEnd 76902
Mirror 3214
Regular pair 1132002
SelfCircle 3036
Name: readID, dtype: int64
[16]:
# Full scaling
n = len(df)
cis_scalings, trans_levels = scaling.compute_scaling(
df,
regions=hg38_arms,
chromsizes=hg38_arms,
dist_range=(10, 1e9),
n_dist_bins=128,
chunksize=int(1e7),
)
plot(cis_scalings, n, label="pairs")
# The point where the scalings by distance become balanced:
plt.axvline(2e3, ls='--', c='gray', label='Balancing point')
plt.savefig("./oriented_scalings.pdf")
[17]:
df.loc[:, "type_bydist"] = "Regular pair"
mask_ondiagonal = (np.abs(df.pos2-df.pos1)<=2e3)
mask_DE = (df.strand1=="+") & (df.strand2=="-") & mask_ondiagonal
df.loc[mask_DE, "type_bydist"] = "DanglingEnd"
mask_SS = (df.strand1=="-") & (df.strand2=="+") & mask_ondiagonal
df.loc[mask_SS, "type_bydist"] = "SelfCircle"
mask_Err = (df.strand1==df.strand2) & mask_ondiagonal
df.loc[mask_Err, "type_bydist"] = "Mirror"
[18]:
df.sort_values("type_bydist").groupby("type_bydist").count()['readID']
[18]:
type_bydist
DanglingEnd 135381
Mirror 18383
Regular pair 1053213
SelfCircle 8177
Name: readID, dtype: int64
[19]:
df.sort_values(["type_rfrag", "type_bydist"])\
.groupby(["type_rfrag", "type_bydist"])\
.count()[['readID']]\
.reset_index()\
.pivot(columns="type_bydist", index="type_rfrag")\
.fillna(0).astype(int)
[19]:
readID | ||||
---|---|---|---|---|
type_bydist | DanglingEnd | Mirror | Regular pair | SelfCircle |
type_rfrag | ||||
DanglingEnd | 76898 | 0 | 4 | 0 |
Mirror | 0 | 3176 | 38 | 0 |
Regular pair | 58483 | 15207 | 1052994 | 5318 |
SelfCircle | 0 | 0 | 177 | 2859 |
False Positives are in 3rd row, False Negatives are in 3rd column. Filtering by distance is, thus, nearly as effective as filtering by restriction fragment, but removes additional pairs that can be potential undercut by restriction enzyme.
Removing all contacts closer than 2 Kb will remove Hi-C artifacts.