This single cell ATAC-Seq analysis pipeline is designed for advanced analysis of dataset, produced by 10X Genomics Cell Ranger ATAC.\ Aggregated datasets are also supported!
In addition to 10X Genomics results it offers:
Required 10X Genomics Cell Ranger ATAC files:
fragments.tsv
- fragments matrix file provided by Cell Rangerpeaks.bed
- peaks file (union of peaks) in case of mergingclusters.csv
- clustering provided by Cell Rangerprojection.csv
- t-SNE 2d project of all the cellsPipeline steps:
Other pipelines:
Source code is available at: https://github.com/JetBrains-Research/sc-atacseq-explorer \ Questions and comments are welcome at os at jetbrains dot com
# # Configuration of 10X Genomics files
# FRAGMENTS_FILE_10X = 'fragments.tsv.gz'
# CLUSTERS_FILE_10X = 'clusters.csv'
# TSNE_FILE_10X = 'tsne.csv'
# PEAKS_FILE = 'peaks.bed'
# # See https://www.encodeproject.org/annotations/ENCSR636HFF/
# # Set this value to None if blacklist regions bed file is not available
# BLACKLIST_FILE = 'blacklist.bed'
# # For mm10 and hg38
# # ! wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gtf.gz
# # ! wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/gencode.v32.annotation.gtf.gz
# # ! gunzip *.gtf.gz
# GTF_FILE = 'gencodegenes.gtf'
# # Dowload DNAse hypersensitivity sites from encodeproject.org
# # For mm10
# # ! wget https://www.encodeproject.org/files/ENCFF108APF/@@download/ENCFF108APF.bed.gz
# # ! gunzip *.gz
# # Set this to None if DNASe file is not available
# DNASE_FILE = 'dnase.bed'
# # Output configuration
# OUTPUT_DIR = 'pipeline'
# # Output configuration
# OUTPUT_DIR = 'pipeline'
# Configuration of 10X Genomics files
FRAGMENTS_FILE_10X = '/mnt/stripe/shpynov/2019_sc_atacseq_DT1755_3vs3_human/ALAW-XNN-AS011/outs/fragments.tsv.gz'
CLUSTERS_FILE_10X = '/mnt/stripe/shpynov/2019_sc_atacseq_DT1755_3vs3_human/ALAW-XNN-AS011/outs/analysis/clustering/graphclust/clusters.csv'
TSNE_FILE_10X = '/mnt/stripe/shpynov/2019_sc_atacseq_DT1755_3vs3_human/ALAW-XNN-AS011/outs/analysis/tsne/2_components/projection.csv'
PEAKS_FILE = '/mnt/stripe/shpynov/2019_sc_atacseq_DT1755_3vs3_human/ALAW-XNN-AS011/outs/peaks.bed'
# See https://www.encodeproject.org/annotations/ENCSR636HFF/
# Set this value to None if blacklist regions bed file is not available
BLACKLIST_FILE = None
# For mm10 and hg38
# ! wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gtf.gz
# ! wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/gencode.v32.annotation.gtf.gz
# ! gunzip *.gtf.gz
GTF_FILE = '/mnt/stripe/shpynov/2019_sc_atacseq_DT1755_3vs3_human/gencode.v32.annotation.gtf'
# Dowload DNAse hypersensitivity sites from encodeproject.org
# For mm10
# ! wget https://www.encodeproject.org/files/ENCFF108APF/@@download/ENCFF108APF.bed.gz
# ! gunzip *.gz
# Set this to None if DNASe file is not available
DNASE_FILE = None
# Output configuration
OUTPUT_DIR = '/mnt/stripe/shpynov/2019_sc_atacseq_DT1755_3vs3_human/pipeline'
# # Factors configuration, number should be consistent with number of merged
# FACTOR = 'factor'
# FACTORS_MAP = {1: 'f1', 2: 'f2'}
# # Aggregated barcodes always have suffix as a marker of original track
# def FACTOR_FUNCTION(x):
# for k in [1, 2]:
# if x.endswith(f'-{k}'):
# return k
# # Cell calling filtration thresholds
# NOISE_THRESHOLD = 200
# DUPLETS_THRESHOLD = 8000
# # Default by Cell Ranger ATAC
# LSA_COMPONENTS = 15
# # 20 clusters is default by Cell Ranger ATAC
# N_CLUSTERS = 20
# # Markers configuration
# MARKERS_GENES = []
# # Export to Single Cell Explorer configuration
# SCE_TOKEN = '2020_scatacseq'
# SCE_NAME = 'Single cell ATAC-Seq'
# SCE_PEAK_GENE_TSS_MAX_DISTANCE = 10000
# SCE_PUBLIC = False
# SCE_ORGANISM = 'mm' # mm or hg are supported
# Factors configuration, number should be consistent with number of merged
FACTOR = 'age'
FACTORS_MAP = {1: 'young', 2: 'young', 3: 'young', 4: 'old', 5:'old', 6: 'old'}
# Aggregated barcodes always have suffix as a marker of original track
def FACTOR_FUNCTION(x):
for k in FACTORS_MAP.keys():
if x.endswith(f'-{k}'):
return k
# Cell calling filtration thresholds
NOISE_THRESHOLD = 2000
DUPLETS_THRESHOLD = 20000
# Default by Cell Ranger ATAC
LSA_COMPONENTS = 15
# 20 clusters is default by Cell Ranger ATAC
N_CLUSTERS = 20
# Markers configuration
MARKERS_GENES = ['Igkc', 'Jchain', 'Ifit3', 'Ifit1', 'Mki67', 'Cd4', 'Gzmb', 'Pld4', 'Ighe', 'Ighd', 'Ighm',
'Cd19', 'Cd79a', 'Cd8b', 'Il7r', 'Tcf7', 'Cd3e', 'Cd3d', 'Ccr7', 'Cd1c', 'Fcer1a', 'Cd34',
'Gata2', 'Pf4', 'Ppbp', 'Sox4', 'Eomes', 'Gzmk', 'Klrg1', 'Trav1-2', 'Ccl5', 'Sell',
'Ikzf2', 'Foxp3', 'Gzma', 'Ncam1', 'Prf1', 'Trdc', 'Nkg7', 'Ncr1', 'Batf3', 'Clec9a',
'Lyz', 'Flt3', 'Fcrg3a', 'Cd14', 'Cd44', 'Seprina1', 'Trgv9', 'Trgv2']
# Export to Single Cell Explorer configuration
SCE_TOKEN = '2019_scatacseq_human_3vs3'
SCE_NAME = 'Single cell ATAC-Seq human 3vs3'
SCE_PEAK_GENE_TSS_MAX_DISTANCE = 10000
SCE_PUBLIC = False
SCE_ORGANISM = 'hg' # mm or hg are supported
# Ensure all the libraries are installed
%matplotlib inline
%config InlineBackend.figure_format ='retina'
import gc
import glob
import hashlib
import math
import matplotlib.pyplot as plt
import numpy as np
# Workaround for the issue: https://github.com/pandas-dev/pandas/issues/26314
# pip install pandas==0.21
import pandas as pd
import os
import re
import sys
import seaborn as sns
import umap
sns.set_style("whitegrid")
import re
import tempfile
import glob
import pyBigWig
import json
import h5py
from scipy import io, sparse
from scipy.cluster import hierarchy
import matplotlib.cm as cm
from matplotlib.colors import LinearSegmentedColormap
from collections import Counter
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.preprocessing import Normalizer
from pybedtools import BedTool
from tqdm.auto import tqdm
from scipy.stats import zscore
from matplotlib.backends.backend_pdf import PdfPages
# Import pipeline functions
# These functions are taken from the original Cell Ranger 10x ATAC pipeline
# See https://github.com/10XGenomics/cellranger/
from irlb import irlb
from diffexp import *
# Fix random seed
np.random.seed(0)
# Create pipeline output folders
figures_path = os.path.join(OUTPUT_DIR, 'figures')
! mkdir -p {figures_path}
! mkdir -p {OUTPUT_DIR}/cache
# Used memory analysis utility
def sizeof_fmt(num, suffix='B'):
for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
if abs(num) < 1024.0:
return "%3.1f %s%s" % (num, unit, suffix)
num /= 1024.0
return "%.1f %s%s" % (num, 'Yi', suffix)
# Cleanup
gc.collect()
# Print all allocated variables
def print_mem_usage():
for name, size in sorted(((name, sys.getsizeof(value)) for name, value in globals().items()),
key= lambda x: x[1],
reverse=True)[:10]:
print("Global {:>30}: {:>8}".format(name, sizeof_fmt(size)))
for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
key= lambda x: x[1],
reverse=True)[:10]:
print("Local {:>30}: {:>8}".format(name, sizeof_fmt(size)))
# clusters10xdf = pd.read_csv(CLUSTERS_FILE_10X, sep=',')
# clusters10xdf['Cluster'] = clusters10xdf['Cluster'].astype(int) - 1 # Clusters start with 1 in 10x
# clusters10xdf[FACTOR] = [FACTOR_FUNCTION(bc) for bc in clusters10xdf['Barcode']]
# # display(clusters10xdf.head())
# print('Total clusters', len(set(clusters10xdf['Cluster'])))
def clusters_sizes(df, value):
df_sum = df[['cluster', value]].groupby(['cluster']).sum().reset_index()
plt.figure(figsize=(10, 4))
cmap=plt.cm.get_cmap('jet', len(df_sum))
sns.barplot(x=df_sum.index, y=df_sum[value],
palette=[cmap(i) for i in range(len(df_sum))])
def clusters_factors(df, value, FACTORS_MAP):
df_sum = df[['cluster', value]].groupby(['cluster']).sum().reset_index()
df_sum_factor = df[['cluster', FACTOR, value]].groupby(['cluster', FACTOR]).sum().reset_index()
plt.figure(figsize=(10, 4))
cmap=plt.cm.get_cmap('jet', len(FACTORS_MAP))
bottom = np.zeros(len(df_sum))
for i, (k, v) in enumerate(FACTORS_MAP.items()):
heights = np.array(df_sum_factor.loc[df_sum_factor[FACTOR] == k][value]) /\
np.array(df_sum[value]) * 100.0
plt.bar(x = df_sum.index, height = heights, bottom=bottom, color = cmap(i), width=0.8)
bottom += heights
# Cosmetics
plt.xticks(df_sum.index)
plt.gca().xaxis.grid(False)
plt.gca().margins(x=0.005)
legend_colors = [plt.Rectangle((0,0), 1, 1, fc=cmap(i), edgecolor = 'none')
for i in range(len(FACTORS_MAP))]
legend_names = [v for _,v in FACTORS_MAP.items()]
l = plt.legend(legend_colors, legend_names, prop={'size': 10}, bbox_to_anchor=(1.01, 0.99))
l.draw_frame(False)
plt.ylabel('Cluster composition ' + FACTOR + ' %')
plt.xlabel('Cluster')
# print('Size of 10x clusters analysis')
# t = clusters10xdf.copy()
# t['counts'] = np.ones(len(t))
# t.rename(columns={'Cluster': 'cluster'}, inplace=True)
# t.index = t['Barcode']
# print('Summary by', FACTOR, FACTORS_MAP)
# display(t[[FACTOR, 'counts']].groupby([FACTOR]).sum())
# clusters_sizes(t[['cluster', 'counts', FACTOR]], 'counts')
# plt.show()
# clusters_factors(t[['cluster', 'counts', FACTOR]], 'counts', FACTORS_MAP)
# plt.show()
# # Cleanup memory
# del t
# gc.collect()
def plot_colored(xs, ys, c,
clusters=False, show_centers=False,
size=10, alpha=0.5, no_ticks=True, fig_ax=None):
""" Plot colored scatterplot """
cmap = plt.cm.get_cmap('jet', len(set(c))) if clusters else plt.cm.viridis
if fig_ax is not None:
fig, ax = fig_ax
else:
fig, ax = plt.subplots(1, figsize=(10, 8))
sc = ax.scatter(xs, ys,
c=c, cmap=cmap,
marker='o',
alpha=alpha,
s=size)
mesh = ax.get_children()[0]
if no_ticks:
plt.setp(ax, xticks=[], yticks=[])
if clusters and show_centers:
# For each cluster, we add a text with name
for cluster in set(c):
cluster_x = np.mean(xs.iloc[np.flatnonzero(c == cluster)])
cluster_y = np.mean(ys.iloc[np.flatnonzero(c == cluster)])
ax.text(cluster_x, cluster_y, str(cluster),
horizontalalignment='center', size='large', color='black', weight='bold')
if clusters:
cbar = plt.colorbar(mesh, ax=ax, boundaries=np.arange(len(set(c)) + 1) + min(c) - 0.5)
cbar.set_ticks(np.arange(len(set(c))) + min(c))
cbar.set_ticklabels(list(set(c)))
else:
plt.colorbar(mesh, ax=ax)
def n_rows_columns(n, maxcols=5):
ncols = max([r for r in range(1, min(maxcols, math.ceil(math.sqrt(n)) + 1)) if n % r == 0])
nrows = int(n / ncols)
return nrows, ncols
def plot_colored_split(xs, ys, c, size=10, alpha=0.5, contrast = 100):
""" Plot colored scatterplot split by factors"""
cmap = plt.cm.get_cmap('jet', len(set(c)))
nrows, ncols = n_rows_columns(len(set(c)))
plt.subplots(nrows=nrows, ncols=ncols, figsize=(5 * ncols, 5 * nrows))
xmin, xmax = np.min(xs), np.max(xs)
xlim = (xmin - 0.1 * (xmax - xmin), xmax + 0.1 * (xmax - xmin)) # +10% margin
ymin, ymax = np.min(ys), np.max(ys)
ylim = (ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)) # +10% margin
for i, f in enumerate(tqdm(set(c))):
ax = plt.subplot(nrows, ncols, i + 1)
plt.setp(ax, xticks=[], yticks=[])
plt.xlim(xlim)
plt.ylim(ylim)
plt.title(str(f))
# Plot slightly visible other as gray
nfxs = xs.iloc[np.flatnonzero(np.logical_not(np.equal(c, f)))]
nfys = ys.iloc[np.flatnonzero(np.logical_not(np.equal(c, f)))]
plt.scatter(nfxs, nfys,
color='gray',
marker='o',
alpha=alpha / contrast,
s=size)
# And factor data
fxs = xs.iloc[np.flatnonzero(np.equal(c, f))]
fys = ys.iloc[np.flatnonzero(np.equal(c, f))]
plt.scatter(fxs, fys,
color=cmap(i),
marker='o',
alpha=alpha,
s=size)
# print('10x Cell Ranger ATAC original T-SNE visualization')
# tsne10xpath = TSNE_FILE_10X
# tsne10xdf = pd.read_csv(tsne10xpath, sep=',')
# tsne_clusters_10xdf = pd.merge(left=tsne10xdf,right=clusters10xdf, left_on='Barcode', right_on='Barcode')
# plot_colored(tsne_clusters_10xdf['TSNE-1'], tsne_clusters_10xdf['TSNE-1'], tsne_clusters_10xdf['Cluster'],
# clusters=True,
# show_centers=True,
# size=20, alpha=0.5)
# plt.show()
# print('10x tSNE on LSA normalized vs', FACTOR, FACTORS_MAP)
# plot_colored(tsne_clusters_10xdf['TSNE-1'], tsne_clusters_10xdf['TSNE-1'], clusters10xdf[FACTOR],
# clusters=True, size=20, alpha=0.1)
# plt.show()
# Uncomment to plot split by factor
# with PdfPages(os.path.join(figures_path, '10x_tsne_clusters_split.pdf')) as pdf:
# plot_colored_split(tsne_clusters_10xdf['TSNE-1'], tsne_clusters_10xdf['TSNE-2'], clusters10xdf[FACTOR],
# size=5, alpha=0.1)
# pdf.savefig()
with PdfPages(os.path.join(figures_path, 'peaks_length.pdf')) as pdf:
peaks_df = pd.read_csv(PEAKS_FILE, sep='\t', header=None)
print('Peaks', len(peaks_df))
sns.distplot(np.log10(peaks_df[2] - peaks_df[1]))
plt.title('Peak length distribution')
plt.xlabel('Log10 Length')
plt.ylabel('Frequency')
if DNASE_FILE is not None:
dnase = BedTool(DNASE_FILE)
peaks_file = BedTool(PEAKS_FILE)
peak_count = peaks_file.count()
overlap = peaks_file.intersect(dnase, wa=True, u=True).count()
peak_by_dnase = 0 if overlap == 0 else overlap * 100.0 / peak_count
overlap = dnase.intersect(peaks_file, wa=True, u=True).count()
dnase_by_peak = overlap * 100.0 / dnase.count()
print('Fraction of peaks overlapping with representative DNAse', int(peak_by_dnase), '%')
print('Fraction of representative DNAse overlapping with peaks', int(dnase_by_peak), '%')
Pipeline Preprocess GENES to TSS and use bedtools closest -D b
to report distance to TSS, upstream or downstream w.r.t. the gene strand.
Cell Ranger ATAC We use bedtools closest -D b
to associate each peak with genes based on closest transcription start sites (packaged within the reference) such that the peak is within 1000 bases upstream or 100 bases downstream of the TSS.
def gtf_to_tss(gtf_file, transcripts_tss):
print(f'GTF file {gtf_file} > {transcripts_tss}')
GTF_TO_TSS_SH = """
# Compute transcript name field in GTF file
TNF=$(cat $1 | grep transcript_id | grep -v '#' | head -n 1 |\
awk '{for (i=4; i<NF; i++) {if ($3=="transcript" && $i=="gene_name") print (i+1)}}')
# Process GTF file into transcripts
cat $1 |\
awk -v TNF=$TNF -v OFS="\t" '{if ($3=="transcript") {print $1,$4-1,$5,$TNF,0,$7}}' |\
tr -d '";' |\
awk -v OFS="\t" '{if ($6=="+") {print $1,$2,$2+1,$4,0,"+"} else {print $1,$3-1,$3,$4,0,"-"}}' |\
sort -k1,1 -k2,2n -k3,3n --unique > $2
"""
with tempfile.NamedTemporaryFile(prefix='gtf_to_tss', suffix='.sh', delete=False) as f:
f.write(GTF_TO_TSS_SH.encode('utf-8'))
f.close()
! bash {f.name} {gtf_file} {transcripts_tss}
transcripts_tss = os.path.join(OUTPUT_DIR, 'cache', os.path.basename(GTF_FILE) + '.tss')
if not os.path.exists(transcripts_tss):
! bash gtf2tss.ssh {GTF_FILE}, {transcripts_tss}
def locations_closest_genes(locations, transcripts_tss):
print('Annotate list of locations in format "chr:start-end" with closest tss of genes')
with tempfile.NamedTemporaryFile(delete=False) as f:
# Save index to restore original order after sorting
for i, t in enumerate([re.split(':|-', p) for p in locations]):
f.write('{}\t{}\t{}\t{}\n'.format(*t, i).encode('utf-8'))
f.close()
closest_file = f.name + '.closest'
sorted_file = f.name + '.sorted'
! sort -k1,1 -k2,2n {f.name} > {sorted_file}
! bedtools closest -a {sorted_file} -b {transcripts_tss} -D b > {closest_file}
closest_df = pd.read_csv(
closest_file, sep='\t',
names=['chr', 'start', 'end', 'index',
'gene_chr', 'gene_start', 'gene_end', 'gene', 'gene_score', 'gene_strand', 'distance'])
closest_df['gene_length'] = [len(g) for g in closest_df['gene']]
# Sort_by_peaks and gene length in case of several existing genes
closest_df.sort_values(by=['index', 'gene_length'], inplace=True)
# HACK: Pick only first closest gene with shortest name in case of several
# Example: hg38 GZMK and AC034238.1 -> GZMK
closest_df.drop_duplicates(['index'], inplace=True)
return closest_df[['chr', 'start', 'end', 'gene', 'distance']]
def find_peaks_for_markers(peaks_df, markers_genes):
print('Building genes peaks, closest peaks is mapped to gene, second closest with suffix _1, etc')
closest_genes = locations_closest_genes(
peaks_df[0] + ':' + peaks_df[1].astype(str) + '-' + peaks_df[2].astype(str),
transcripts_tss
)
# Restore 'peak' to be able to merge on it
closest_genes['peak'] = closest_genes['chr'] +\
':' + closest_genes['start'].astype(str) +\
'-' + closest_genes['end'].astype(str)
t = closest_genes[['gene', 'peak']].copy()
t['distance'] = np.abs(closest_genes['distance'])
# Filter peak close to gene tss of markers_genes
marker_genes_lower = set([s.lower() for s in markers_genes])
t = t.loc[np.logical_and(t['distance'] < 2000, [g.lower() in marker_genes_lower for g in t['gene']])]
t.sort_values(by=['gene', 'distance'], inplace=True)
marker_regions_peaks_map = {}
prev_gene = None
prev_count = 1
seen_genes = set()
for _, (gene, peak, distance) in t.iterrows():
if prev_gene != gene:
marker_regions_peaks_map[gene] = peak
seen_genes.add(gene.lower())
prev_gene = gene
prev_count = 1
elif prev_count < 3: # Max regions per gene
marker_regions_peaks_map[f'{gene}_{prev_count}'] = peak
prev_count += 1
print(f'Found {len(marker_regions_peaks_map)} marker peaks for {len(markers_genes)} peaks close to TSS')
for g in markers_genes:
if g.lower() not in seen_genes:
print(f'Missing peak for {g}')
for k, v in marker_regions_peaks_map.items():
print(f'{k}: {v}')
return marker_regions_peaks_map
# IMPORTANT: check peaks, important for potential clusters identification without bigwig visualization
marker_regions_peaks_map = find_peaks_for_markers(peaks_df, MARKERS_GENES)
Pipeline uses noise_threshold and duplets_threshold.
Cell Ranger ATAC aggregation: when combining data from multiple GEM groups, the cellranger-atac aggr pipeline automatically equalizes the sensitivity of the groups before merging, which is the recommended approach in order to avoid the batch effect introduced by sequencing depth. Default method: Subsample fragments from higher-depth GEM wells until they all have an equal number of unique fragments per cell.
For each barcode:
This works better in practice as compared to naively using the number of fragments per barcode. We first subtract a depth-dependent fixed count from all barcode counts to model whitelist contamination. This fixed count is the estimated number of fragments per barcode that originated from a different GEM, assuming a contamination rate of 0.02. Then we fit a mixture model of two negative binomial distributions to capture the signal and noise. Setting an odds ratio of 1000, we separate the barcodes that correspond to real cells from the non-cell barcodes.
Scasat If a cell has open peaks below a user defined threshold (default: 50 peaks) in the peak accessibility matrix we would remove that cell from subsequent analysis. Also peaks not observed across a user defined number of valid cells (default: 10 cells) are not considered for further downstream analysis.
SnapATAC alternative approach - 1kb, 5kb and 10kb resolution.
Existing computational methods rely on pre-defined regions of transposase accessibility identified from the aggregate signals. ...
Limitations:
# Intersect using bedtools rather than pybedtools, because they are too slow for files of this size!
def intersect_fragments_and_peaks(fragments_file_gz, peaks_file, blacklist_file):
print('Fragments')
! zcat {fragments_file_gz} | wc -l
idf = None
if blacklist_file is not None:
print('Blacklist')
! wc -l {blacklist_file}
print('Peaks')
! wc -l {peaks_file}
with tempfile.TemporaryDirectory(prefix='pipeline', dir=OUTPUT_DIR) as td:
print('Filtering out non-standard chromosomes')
chr_filtered_file = os.path.join(td, 'chromosome_filtered.bed')
! zcat {fragments_file_gz} | grep -i -E 'chr[0-9mt]+[^_]' > {chr_filtered_file}
! wc -l {chr_filtered_file}
if blacklist_file is not None:
print('Blacklist regions filtration')
blacklist_filtered_file = os.path.join(td, 'blacklist_filtered.bed')
! bedtools intersect -v -a {chr_filtered_file} -b {blacklist_file} > {blacklist_filtered_file}
! wc -l {blacklist_filtered_file}
# Cleanup
! rm {chr_filtered_file}
else:
blacklist_filtered_file = chr_filtered_file
print('Fragments and peaks intersection')
intersection_file = os.path.join(td, 'intersection.bed')
! bedtools intersect -wa -wb -a {blacklist_filtered_file} -b {peaks_file} > {intersection_file}
! wc -l {intersection_file}
# Cleanup
! rm {blacklist_filtered_file}
idf = pd.read_csv(intersection_file, sep='\t', header=None)
# Cleanup
! rm {intersection_file}
icolnames = ['chr', 'start', 'end', 'barcode', 'reads', 'peak_chr', 'peak_start', 'peak_end']
assert len(idf.columns) >= len(icolnames)
idf.rename(columns={o: n for (o, n) in zip(idf.columns[:len(icolnames)], icolnames)}, inplace=True)
idf = idf[icolnames] # Reorder and drop others
idf = idf.astype({'reads': 'int8'}) # Consume less memory
print('Done intersecting of fragments and peaks')
return idf
def load_or_compute_idf(fragments_file_gz, peaks_file, blacklist_file, cache=None):
# Load or compute fragments and peaks intersection data frame
if cache is None:
cache = os.path.join(OUTPUT_DIR, 'cache')
idfpath = os.path.join(cache, 'idf.csv.gz')
if os.path.exists(idfpath):
print(f'Loading IDF from {idfpath}')
idf = pd.read_csv(idfpath, compression='gzip')
else:
print(f'No IDF file found {idfpath}')
idf = intersect_fragments_and_peaks(fragments_file_gz, peaks_file, blacklist_file)
idf.to_csv(idfpath, index=False, compression='gzip')
print(f'Saved IDF to {idfpath}')
return idf
def filter_cells(idf, noise_threshold=None, duplets_threshold=None):
# 10x Genomics Cell Ranger ATAC-Seq marks duplicates
# 'count' ignores multiple reads per barcode at same position
pidf = pd.pivot_table(idf, values='reads', index=['barcode'], aggfunc='count')
pidf.reset_index(level=0, inplace=True)
print(f'Total barcodes intersecting peaks: {len(pidf)}')
plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
plt.subplot(1, 3, 1)
sns.distplot(np.log10(pidf['reads']))
plt.title('UMI summary coverage distribution')
plt.xlabel('Log10 coverage')
plt.ylabel('Frequency')
# n is for shoulder graph
counts = sorted(pidf['reads'], reverse=True)
df_counts = pd.DataFrame(data={'count': counts, 'n': range(1, len(counts) + 1)})
cells_filter = [True] * len(df_counts)
plt.subplot(1, 3, 2)
if noise_threshold is not None:
noise_filter = df_counts['count'] <= noise_threshold
cells_filter = np.logical_and(cells_filter, np.logical_not(noise_filter))
df_noise = df_counts.loc[noise_filter]
print(f'Noise <= {noise_threshold} reads per barcode: {len(df_noise)}')
plt.plot(np.log10(df_noise['n']),
np.log10(df_noise['count']),
label='Noise', linewidth=3, color='red')
if duplets_threshold is not None:
duplets_filter = df_counts['count'] > duplets_threshold
cells_filter = np.logical_and(cells_filter, np.logical_not(duplets_filter))
df_duplets = df_counts.loc[duplets_filter]
print(f'Duplets > {duplets_threshold} reads per barcode: {len(df_duplets)}')
plt.plot(np.log10(df_duplets['n']),
np.log10(df_duplets['count']),
label='Duplets', linewidth=3, color='orange')
df_cells = df_counts.loc[cells_filter]
cells_filter = [True] * len(pidf)
if noise_threshold is not None:
cells_filter = np.logical_and(cells_filter, pidf['reads'] > noise_threshold)
if duplets_threshold is not None:
cells_filter = np.logical_and(cells_filter, pidf['reads'] < duplets_threshold)
cells = set(pidf.loc[cells_filter]['barcode'])
idfcells = idf.loc[idf['barcode'].isin(cells)]
print(f'Estimated number of cells: {len(cells)}')
plt.title('Cells calling')
plt.plot(np.log10(df_cells['n']),
np.log10(df_cells['count']),
label='Cell', linewidth=3, color='green')
plt.xlabel('Log10 number of barcodes')
plt.ylabel('Log10 fragments overlapping Peaks')
plt.legend(loc='upper right')
plt.subplot(1, 3, 3)
sns.distplot(np.log10(df_cells['count']))
plt.title('Cells UMI summary coverage distribution')
plt.xlabel('Log10 coverage')
plt.ylabel('Frequency')
return idfcells
def idfcells_from_idf(fragments_tsv_gz, peaks_file, blacklist_file, cache=None):
# Intersection of fragments with peaks dataframe
idf = load_or_compute_idf(fragments_tsv_gz, peaks_file, blacklist_file, cache)
with PdfPages(os.path.join(figures_path, 'cell_calling.pdf')) as pdf:
idfcells = filter_cells(idf, noise_threshold=NOISE_THRESHOLD, duplets_threshold=DUPLETS_THRESHOLD)
pdf.savefig()
return idfcells
Pipeline Compute number of different UMIs which intersect with peak. Multiple copies of UMI are ignored.
Cell Ranger ATAC-Seq We produce a count matrix consisting of the counts of fragment ends (or cut sites) within each peak region for each barcode. This is the raw peak-barcode matrix and it captures the enrichment of open chromatin per barcode. The matrix is then filtered to consist of only cell barcodes, which is then used in subsequent analysis such as dimensionality reduction, clustering and visualization.
A barcoded fragment may get sequenced multiple times due to PCR amplification. We mark duplicates in order to identify the original fragments that constitute the library and contribute to its complexity.
def compute_coverage(idfcells):
print('Barcode vs summary fragments overlap with peaks')
# 10x Genomics Cell Ranger ATAC-Seq marks duplicates
# 'count' ignores multiple reads per barcode at same position
cell_peak_cov_df = pd.pivot_table(idfcells, values='reads',
index=['peak_chr', 'peak_start', 'peak_end', 'barcode'],
aggfunc='count').reset_index()
cell_peak_cov_df['peak'] = cell_peak_cov_df['peak_chr'] + ':' + \
cell_peak_cov_df['peak_start'].astype(str) + '-' + cell_peak_cov_df['peak_end'].astype(str)
cell_peak_cov_df.drop(columns=['peak_chr', 'peak_start', 'peak_end'], inplace=True)
print('Transforming dataframe to peaks x barcode format')
coveragedf = pd.pivot_table(cell_peak_cov_df, index='peak', columns='barcode', values='reads').fillna(0)
# Remove extra labels from pivot_table columns
coveragedf.columns = coveragedf.columns.values
coveragedf.index.name = None
return coveragedf
def load_or_compute_coverage(idfcells_computable, cache=None):
if cache is None:
cache = os.path.join(OUTPUT_DIR, 'cache')
cov_mtx_path = os.path.join(cache, 'coverage.npz')
barcodes_path = os.path.join(cache, 'barcodes.txt')
peaks_path = os.path.join(cache, 'peaks.txt')
if os.path.exists(cov_mtx_path):
print(f'Loading coverage matrix {cov_mtx_path}')
csc = sparse.load_npz(cov_mtx_path)
barcodes = [l.rstrip('\n') for l in open(barcodes_path)]
peaks = [l.rstrip('\n') for l in open(peaks_path)]
print(f'Loading into dataframe')
coveragedf = pd.DataFrame(csc.toarray(), index=peaks, columns=barcodes)
coveragedf.fillna(0, inplace=True)
else:
print(f'No coverage matrix found {cov_mtx_path}')
idfcells = idfcells_computable()
coveragedf = compute_coverage(idfcells)
print(f'Saving coverage matrix')
csc = sparse.csc_matrix(coveragedf.values, dtype=int)
sparse.save_npz(cov_mtx_path, csc)
print('Saving barcodes')
with open(barcodes_path, 'w') as f:
f.write('\n'.join(coveragedf.columns))
print('Saving peaks')
with open(peaks_path, 'w') as f:
f.write('\n'.join(coveragedf.index))
print('DF', coveragedf.shape[0], 'peaks x', coveragedf.shape[1], 'cells')
return coveragedf
# Computing main coverage dataframe (Peaks x Barcodes)
coveragedf = load_or_compute_coverage(
lambda: idfcells_from_idf(FRAGMENTS_FILE_10X, PEAKS_FILE, BLACKLIST_FILE)
)
print('Summary coverage per UMI')
umi_coverage_log10 = np.log10(coveragedf.sum())
with PdfPages(os.path.join(figures_path, 'umi_coverage.pdf')) as pdf:
sns.distplot(umi_coverage_log10)
plt.title('Coverage per UMI')
plt.xlabel('Log10 barcodes')
plt.ylabel('Frequency')
pdf.savefig()
print('Peaks per cell with non-zero coverage filtration')
peaks_per_cell = np.count_nonzero(coveragedf, axis=0)
with PdfPages(os.path.join(figures_path, 'peaks_per_cell.pdf')) as pdf:
sns.distplot(np.log10(peaks_per_cell))
plt.title('Peaks with coverage > 0 per cell')
plt.xlabel('Log10 peaks')
plt.ylabel('Frequency')
pdf.savefig()
with PdfPages(os.path.join(figures_path, 'umis.pdf')) as pdf:
print('Summary UMI distribution per factor')
factors = [FACTOR_FUNCTION(bc) for bc in coveragedf.columns]
for k, v in tqdm(FACTORS_MAP.items()):
sns.kdeplot(np.log10(coveragedf[coveragedf.columns[np.flatnonzero(np.equal(factors, k))]].sum()),
label=v)
plt.title('UMI summary coverage distribution')
plt.xlabel('Log10 coverage')
plt.ylabel('Frequency')
print('Cells per peak with non-zero coverage distribution')
cells_per_peak = np.count_nonzero(coveragedf, axis=1)
with PdfPages(os.path.join(figures_path, 'cells_per_peaks.pdf')) as pdf:
sns.distplot(np.log10(cells_per_peak))
plt.title('Cells with coverage > 0 per peak')
plt.xlabel('Log10 cells')
plt.ylabel('Frequency')
pdf.savefig()
# Plot peak coverage vs peak length
with PdfPages(os.path.join(figures_path, 'peaks_coverage_vs_length.pdf')) as pdf:
sns.jointplot(x=np.log10(cells_per_peak),
y=[int(re.split(':|-', p)[2]) - int(re.split(':|-', p)[1]) for p in coveragedf.index],
alpha=0.1)
plt.xlabel('Log10 cells')
plt.ylabel('Peaks length')
plt.suptitle('Peak coverage vs peak length')
pdf.savefig()
plt.show
Pipeline
Cell Ranger ATAC removed features selection with zero variance.
SnapATAC The vast majority of the items in the cell-by-bin count matrix is “0”, some items have abnormally high coverage (often > 200) perhaps due to alignment error. Bins of exceedingly high coverage which likely represent the genomic regions that are invariable between cells such as housekeeping gene promoters were removed. We noticed that filtering bins of extremely low coverage perhaps due to random noise can also improve the robustness of the downstream clustering analysis.
Signac The largely binary nature of scATAC-seq data makes it challenging to perform ‘variable’ feature selection, as we do for scRNA-seq. Instead, we can choose to use only the top n% of features (peaks) for dimensional reduction, or remove features present in less that n cells with the FindTopFeatures function. Here, we will all features, though we note that we see very similar results when using only a subset of features (try setting min.cutoff to ‘q75’ to use the top 25% all peaks), with faster runtimes. Features used for dimensional reduction are automatically set as VariableFeatures for the Seurat object by this function. See for explanation: https://satijalab.org/signac/articles/pbmc_vignette.html
Seurat calculate a subset of features that exhibit high cell-to-cell variation in the dataset (i.e, they are highly expressed in some cells, and lowly expressed in others). We and others have found that focusing on these genes in downstream analysis helps to highlight biological signal in single-cell datasets.
print('Computing mean and std values for peaks...')
peaks_mean_std_df = pd.DataFrame(
{'mean': np.mean(coveragedf.values, axis=1), 'std': np.std(coveragedf.values, axis=1)},
index=coveragedf.index
)
with PdfPages(os.path.join(figures_path, 'feature_selection_before.pdf')) as pdf:
# df is (peaks x barcodes)
sns.jointplot(x='mean', y='std', data=peaks_mean_std_df, alpha=0.05)
pdf.savefig()
plt.show
def feature_selection(peaks_mean_std_df, stdp=1, meanp=99):
print('Total peaks', len(peaks_mean_std_df))
print('Feature selection: filter out most highly covered peaks as alignment errors or housekeeping genes.')
means_high = np.percentile(peaks_mean_std_df['mean'], meanp)
mean_filter = peaks_mean_std_df['mean'] < means_high
print('Feature selection: filter out non-variable peaks.')
stds_low = np.percentile(peaks_mean_std_df['std'], stdp)
std_filter = peaks_mean_std_df['std'] > stds_low
peaks_filter = np.logical_and(mean_filter, std_filter)
print('Filtered peaks', sum(peaks_filter))
return peaks_filter
peaks_filter = feature_selection(peaks_mean_std_df)
with PdfPages(os.path.join(figures_path, 'feature_selection_after.pdf')) as pdf:
sns.jointplot(x='mean', y='std', data=pd.DataFrame({'mean': peaks_mean_std_df.loc[peaks_filter]['mean'],
'std': peaks_mean_std_df.loc[peaks_filter]['std']}),
alpha=0.05)
pdf.savefig()
plt.show
# print('Check for markers')
# t = set(coveragedf.index[np.flatnonzero(peaks_filter)])
# for k,v in marker_regions_peaks_map.items():
# if v not in t:
# print(f'{k} {v} is missing in coveragedf')
# Feature selection is required only for LSA projection!
# # Filter out peaks
# coveragedf = coveragedf.loc[peaks_filter].copy() # Copy is important here
Pipeline Use Cell Ranger ATAC approach and IRLB code for dimensionality reduction.
Cell Ranger uses normalization to median coverage depth in each UMI for PCA analysis,\ and doesn't do any normalization for LSA (default scheme) - we do the same here.
Seurat we employ a global-scaling normalization method “LogNormalize” that normalizes the feature expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result.
SnapATAC does not require population-level peak annotation, and instead assembles chromatin landscapes by directly clustering cells based on the similarity of their genome-wide accessibility profile. Using a regression-based normalization procedure, SnapATAC adjusts for differing read depth between cells.
Cell Ranger ATAC default method is LSA.
IRLBA: The augmented implicitly restarted Lanczos bidiagonalization algorithm finds a few approximate largest singular values and corresponding singular vectors of a sparse or dense matrix using a method of Baglama and Reichel.
PCA if your features are least sensitive (informative) towards the mean of the distribution, then it makes sense to subtract the mean. If the features are most sensitive towards the high values, then subtracting the mean does not make sense.
SVD does not subtract the means but often as a first step projects the data on the mean of all data points. In this way the SVD first takes care of global structure - this is important for IDF transformation.
We found that the combination of these normalization techniques obviates the need to remove the first component. The number of dimensions is fixed to 15 as it was found to sufficiently separate clusters visually and in a biologically meaningful way when tested on peripheral blood mononuclear cells (PBMCs).
def lsa(coveragedf, dim):
print('Normalizing by IDF')
idf = np.log1p(coveragedf.shape[1]) - np.log1p(np.count_nonzero(coveragedf, axis=1))
df_idf = coveragedf.multiply(idf, axis=0).T # Transpose to (barcodes x peaks) format
print('Performing IRLBA without scaling or centering')
# Number of dimensions recommended by Cell Ranger ATAC-Seq algorithm
(_, _, v, _, _) = irlb(df_idf, dim)
# project the matrix to complete the transform: X --> X*v = u*d
df_idf_irlb = df_idf.dot(v)
assert df_idf_irlb.shape == (df_idf.shape[0], dim), 'check dimensions'
# IMPORTANT! We found that the combination of these normalization techniques
# obviates the need to remove the first component.
print('Normalization each barcode data point to unit L2-norm in the lower dimensional space')
idf_irlb_l2 = Normalizer(norm='l2').fit_transform(df_idf_irlb) # (n_samples, n_features)
return pd.DataFrame(idf_irlb_l2, index=df_idf_irlb.index, columns=df_idf_irlb.columns)
# Perform LSA on variable peaks. Peaks_filter is used only for this purpose!
lsa_coords = lsa(coveragedf.loc[peaks_filter], LSA_COMPONENTS)
print('Processing t-SNE on L2-normalized data')
tsne_coords = pd.DataFrame(TSNE(n_components=2).fit_transform(lsa_coords.values),
index=lsa_coords.index,
columns=['tsne1', 'tsne2'])
print('tSNE of log10 coverate')
plot_colored(tsne_coords['tsne1'], tsne_coords['tsne2'], umi_coverage_log10, size=20)
plt.show()
coverage_filter = coveragedf.columns[np.flatnonzero(umi_coverage_log10 <= 4.2)]
print(f'Total cells {coveragedf.shape[1]}, filtered cells {len(coverage_filter)}')
plot_colored(tsne_coords['tsne1'].loc[coverage_filter],
tsne_coords['tsne2'].loc[coverage_filter],
umi_coverage_log10.loc[coverage_filter], size=20)
# Filter by proper indices
# coveragedf = coveragedf[coverage_filter].copy()
# Filter current values or reprocess from scratch
# lsa_coords = lsa_coords.loc[coverage_filter].copy()
# tsne_coords = tsne_coords.loc[coverage_filter].copy()
print('tSNE on LSA normalized vs', FACTOR, FACTORS_MAP)
factors = np.array([FACTOR_FUNCTION(bc) for bc in lsa_coords.loc[coverage_filter].index])
plot_colored(tsne_coords['tsne1'].loc[coverage_filter],
tsne_coords['tsne2'].loc[coverage_filter],
factors, clusters=True, size=20, alpha=0.1)
plt.show()
# # Uncomment to plot split by factor
# print(FACTORS_MAP)
# with PdfPages(os.path.join(figures_path, f'tsne_{FACTOR}_split.pdf')) as pdf:
# plot_colored_split(
# tsne_coords['tsne1'].loc[coverage_filter],
# tsne_coords['tsne2'].loc[coverage_filter],
# factors, alpha=0.1
# )
# pdf.savefig()
Clustering should
Hierarchical clustering
Density-based
Graph clustering
See https://www.sciencedirect.com/science/article/pii/S0098299717300493
def reorder_clusters_by_size(clusters):
clusters = np.array(clusters)
cluster_counter = Counter()
for c in clusters:
cluster_counter[c] += 1
clusters_reord = np.zeros(len(clusters), dtype=int)
for i, (c, n) in enumerate(cluster_counter.most_common()):
clusters_reord[np.array(clusters) == c] = i
return clusters_reord
def get_clusters(x, n_clusters):
# Ward linkage method as default with euclidean distance
print('Clustering', n_clusters)
clusters = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(x).astype(int)
return reorder_clusters_by_size(clusters)
clusters = get_clusters(lsa_coords.loc[coverage_filter], N_CLUSTERS)
clusters_df = pd.DataFrame({'cluster': clusters}, index=coverage_filter)
def clusters_summary(coveragedf, clusters):
# Clusters summary by factors
t = pd.DataFrame({FACTOR: [FACTOR_FUNCTION(bc) for bc in coveragedf.columns],
'cluster': clusters, 'counts': np.ones(len(coveragedf.columns))},
index=coveragedf.columns)
clusters_sizes(t, 'counts')
clusters_factors(t, 'counts', FACTORS_MAP)
def clusters_hierarchy(X, clusters):
# Hierarchical clustering of X - mean normalized LSA coordinates per cluster
t = X.reset_index(drop=True).values
clusters_means_X = pd.DataFrame({
str(c): np.mean(t[np.flatnonzero(clusters == c), :], axis=0) for c in sorted(set(clusters))
}).reset_index()
clusters_linkage = hierarchy.linkage(
clusters_means_X[[str(i) for i in set(clusters)]].T.values, method='ward'
)
# hierarchy.dendrogram(clusters_linkage, orientation='top')
# hierarchy.set_link_color_palette(None) # reset to default after use
return clusters_linkage
clusters_summary(coveragedf[coverage_filter], clusters)
print('tSNE on LSA normalized vs graph clusters')
plot_colored(tsne_coords['tsne1'].loc[coverage_filter],
tsne_coords['tsne2'].loc[coverage_filter],
clusters, clusters=True, show_centers=True, size=20, alpha=0.5)
plt.show()
def show_umi_coverage_per_cell_in_clusters(coveragedf, clusters):
ts = []
plt.figure(figsize=(int(len(set(clusters)) * 0.75), 6))
for c in tqdm(set(clusters)):
t = pd.DataFrame({'umi': np.log10(coveragedf.T.iloc[np.flatnonzero(clusters == c)].sum(axis=1) + 1)})
t['cluster'] = str(c)
ts.append(t)
sns.violinplot(data=pd.concat(ts), x='cluster', y='umi', order=[str(c) for c in sorted(set(clusters))])
plt.title('UMI summary coverage per cell')
plt.xlabel('Cluster')
plt.ylabel('Summary log10 coverage')
plt.legend()
show_umi_coverage_per_cell_in_clusters(coveragedf[coverage_filter], clusters)
plt.show()
def show_cells_per_peak_in_clusters(coveragedf, clusters):
ts = []
plt.figure(figsize=(int(len(set(clusters)) * 0.75), 6))
for c in tqdm(set(clusters)):
t = pd.DataFrame({
'nzpeaks': np.log10(np.count_nonzero(coveragedf.T.iloc[np.flatnonzero(clusters == c)], axis=1) + 1)
})
t['cluster'] = str(c)
ts.append(t)
sns.violinplot(data=pd.concat(ts), x='cluster', y='nzpeaks', order=[str(c) for c in sorted(set(clusters))])
plt.title('Peaks with non-zero coverage distribution per cell')
plt.xlabel('Cluster')
plt.ylabel('Frequency')
plt.legend()
show_cells_per_peak_in_clusters(coveragedf[coverage_filter], clusters)
plt.show()
def clusters_mean_coverage(coveragedf, clusters):
print('Compute mean coverage per cluster')
# Transpose to (barcodes x peaks) format
t = coveragedf.T
# Per cluster mean values for peaks
clusters_means = {str(c): t.loc[clusters == c].mean() for c in sorted(set(clusters))}
return pd.DataFrame(clusters_means).reset_index()
def show_markers_mean_values(coveragedf, clusters, clusters_linkage, marker_regions_peaks_map):
clusters_means_df = clusters_mean_coverage(coveragedf, clusters)
print('Hierarchical clustering of coverage in markers')
t = clusters_means_df[[str(c) for c in set(clusters)]].copy()
t.index = clusters_means_df['index']
mt = [(k, v) for k, v in marker_regions_peaks_map.items() if v in t.index]
t = t.loc[[v for _, v in mt]]
t.index = [k for k, _ in mt]
# zscore per row
# t = pd.DataFrame(zscore(t, axis=1), index=t.index, columns=t.columns)
# 0-1 scale per row
t = t.divide(t.max(axis=1), axis='rows')
sns.clustermap(t, figsize=(1 + int(len(set(clusters)) / 3), 1 + int(len(mt) / 4)), cmap='bwr',
col_linkage=clusters_linkage)
# Visualize markers mean values
clusters_linkage = clusters_hierarchy(lsa_coords.loc[coverage_filter], clusters)
show_markers_mean_values(coveragedf[coverage_filter],
clusters, clusters_linkage, marker_regions_peaks_map)
plt.show()
# Group detected by global clustering
refined_group = (0, 6, 9, 3, 5, 7, 8, 13, 2, 17)
print_mem_usage()
Pipeline Split UMI reads by clusters and use RPKM normalization.
Seurat We created pseudo-bulk ATAC-seq profiles by pooling together cells with for each cell type. Each cell type showed enriched accessibility near canonical marker genes. Chromatin accessibility tracks are normalized to sequencing depth (RPKM normalization) in each pooled group.
def write_bigwig(df, bw_path, step):
chr_lengths = df[['chr', 'end']].groupby('chr').max().reset_index()
with pyBigWig.open(bw_path, 'w') as bw:
bw.addHeader(list(zip(chr_lengths['chr'], chr_lengths['end'])))
for chromosome in tqdm(chr_lengths['chr']):
df_chr = df.loc[df['chr'] == chromosome].sort_values(['start', 'end'])
starts = list(df_chr['start'])
ends = list(df_chr['end'])
reads = list(df_chr['reads'])
chr_length = int(chr_lengths.loc[chr_lengths['chr'] == chromosome]['end'])
values = np.zeros(int(math.floor(chr_length / step)) + 1)
for i in range(len(df_chr)):
# Ignore PCR duplicates here!
# v = reads[i]
v = 1.0
si = int(math.floor(starts[i] / step))
ei = int(math.floor(ends[i] / step))
if ei == si:
values[si] += v
else:
values[si] += v / 2
values[ei] += v / 2
non_zero = values > 0
non_zero_inds = np.flatnonzero(non_zero)
starts_np = non_zero_inds * step
ends_np = (non_zero_inds + 1) * step
values_np = values[non_zero]
# RPM normalization
values_np = values_np * (1_000_000.0 / sum(values_np))
chroms_np = np.array([chromosome] * sum(non_zero))
bw.addEntries(chroms_np, starts=starts_np, ends=ends_np, values=values_np)
def write_cluster_bigwigs(fragments_tsv_gz, barcodes, clusters, bigwigs_path, step):
df_for_bigwigs = get_fragments_and_clusters(fragments_tsv_gz, barcodes, clusters)
for cluster in sorted(set(clusters)):
bw_path = os.path.join(bigwigs_path, f'cluster_{cluster}.bw')
print('Processing', cluster, bw_path)
df_for_bigwigs_cluster = df_for_bigwigs.loc[df_for_bigwigs['cluster'] == cluster]
write_bigwig(df_for_bigwigs_cluster, bw_path, step)
print('Done')
def get_fragments_and_clusters(fragments_tsv_gz, barcodes, clusters):
print('Loading full fragments dataframe')
fdf = pd.read_csv(fragments_tsv_gz, sep='\t',
names=['chr', 'start', 'end', 'barcode', 'reads'], compression='gzip')
print('Aggregating fragments and clusters')
t = pd.merge(left=fdf,
right=pd.DataFrame({'barcode': barcodes, 'cluster': clusters}),
left_on='barcode', right_on='barcode')
return t
fragments_and_clusters = get_fragments_and_clusters(FRAGMENTS_FILE_10X, clusters_df.index, clusters)
# Process cluster fragments.tsv.gz and clusters peaks, long operation!
def create_fragments_peaks_bws(fragments_and_clusters, refined_group):
rgn = '_'.join(map(lambda c: str(c), refined_group))
print(f'Processing {rgn}')
rgn_dir = os.path.join(OUTPUT_DIR, 'cache', rgn)
! mkdir {rgn_dir}
bed_path = os.path.join(rgn_dir, f'cluster_{rgn}.bed')
fragments_path = os.path.join(rgn_dir, f'cluster_{rgn}.tsv.gz')
bw_path = os.path.join(rgn_dir, f'cluster_{rgn}.bw')
t = fragments_and_clusters.loc[fragments_and_clusters['cluster'].isin(refined_group)]
print('Saving reads', bed_path)
t[['chr', 'start', 'end']].to_csv(bed_path, sep='\t', header=False, index=False)
print('Saving fragments', fragments_path)
t[['chr', 'start', 'end', 'barcode', 'reads']].to_csv(fragments_path, sep='\t',
header=False, index=False,
compression='gzip')
print('Saving visualization', bw_path)
write_bigwig(t, bw_path, step=200)
# Prepare input for processing
create_fragments_peaks_bws(fragments_and_clusters, refined_group)
# Memory cleanup
del fragments_and_clusters
rgn = '_'.join(map(lambda c: str(c), refined_group))
rgn_dir = os.path.join(OUTPUT_DIR, 'cache', rgn)
# Call peaks with MACS2
print('Launch MACS2 with command line')
print(f'cd {rgn_dir}; macs2 callpeak --keep-dup all --extsize 200 --shift 100 --nomodel \
-t cluster_{rgn}.bed -f BED -n cluster_{rgn}')
# IMPORTANT: check peaks, important for potential clusters identification without bigwig visualization
rgn_peaks_df = pd.read_csv(os.path.join(rgn_dir, f'cluster_{rgn}_peaks.narrowPeak'), sep='\t', header=None)
rgn_marker_regions_peaks_map = find_peaks_for_markers(rgn_peaks_df, MARKERS_GENES)
# Computing coverage dataframe (Peaks x Barcodes)
rgn_coveragedf = load_or_compute_coverage(
lambda: idfcells_from_idf(
os.path.join(rgn_dir, f'cluster_{rgn}.tsv.gz'),
os.path.join(rgn_dir, f'cluster_{rgn}_peaks.narrowPeak'),
BLACKLIST_FILE,
cache=rgn_dir
),
cache=rgn_dir
)
def factors_summary(coveragedf):
t = pd.DataFrame(
{FACTOR: [FACTOR_FUNCTION(bc) for bc in coveragedf.columns],
'counts': np.ones(len(coveragedf.columns))},
index=coveragedf.columns
)
display(t[[FACTOR, 'counts']].groupby([FACTOR]).sum())
factors_summary(rgn_coveragedf)
# Feature selection
rgn_peaks_filter = feature_selection(pd.DataFrame(
{'mean': np.mean(rgn_coveragedf.values, axis=1), 'std': np.std(rgn_coveragedf.values, axis=1)},
index=rgn_coveragedf.index
))
# Perform LSA on variable peaks. Peaks_filter is used only for this purpose!
rgn_lsa_coords = lsa(rgn_coveragedf.loc[rgn_peaks_filter], LSA_COMPONENTS)
print('Processing t-SNE on L2-normalized data')
rgn_tsne_coords = pd.DataFrame(
TSNE(n_components=2).fit_transform(rgn_lsa_coords.values),
index=rgn_lsa_coords.index,
columns=['tsne1', 'tsne2']
)
# Main difference is here, wether we remove non-various peaks before UMI selection or not!
rgn_umi_coverage_log10 = np.log10(rgn_coveragedf.loc[rgn_peaks_filter].sum())
plot_colored(rgn_tsne_coords['tsne1'], rgn_tsne_coords['tsne2'], rgn_umi_coverage_log10, size=20)
rgn_coverage_filter = rgn_coveragedf.columns[np.flatnonzero(rgn_umi_coverage_log10 <= 4.1)]
print(f'Total cells {rgn_coveragedf.shape[1]}, filtered cells {len(rgn_coverage_filter)}')
plot_colored(rgn_tsne_coords['tsne1'].loc[rgn_coverage_filter],
rgn_tsne_coords['tsne2'].loc[rgn_coverage_filter],
rgn_umi_coverage_log10.loc[rgn_coverage_filter], size=20)
def get_clusters_with_summary(coveragedf, lsa_coords, tsne_coords, marker_regions_peaks_map, n_clusters):
clusters = get_clusters(lsa_coords, n_clusters)
# Summary
clusters_summary(coveragedf, clusters)
plt.show()
# Plot clusters on t-SNE
plot_colored(tsne_coords['tsne1'], tsne_coords['tsne2'],
clusters, clusters=True, show_centers=True, size=20, alpha=0.5)
plt.show()
# Visualize coverage in clusters
show_umi_coverage_per_cell_in_clusters(coveragedf, clusters)
plt.show()
# Visualize markers mean values
clusters_linkage = clusters_hierarchy(lsa_coords, clusters)
show_markers_mean_values(coveragedf, clusters, clusters_linkage, marker_regions_peaks_map)
plt.show()
return clusters
rgn_clusters_map = {}
for n_clusters in [10, 20]:
rgn_clusters_map[n_clusters] = get_clusters_with_summary(
rgn_coveragedf[rgn_coverage_filter],
rgn_lsa_coords.loc[rgn_coverage_filter],
rgn_tsne_coords.loc[rgn_coverage_filter],
rgn_marker_regions_peaks_map,
n_clusters
)
print_mem_usage()
# Configure clustering actions: remove cluster number 19 as dupletes, project others back
clustering_actions = [
(rgn_clusters_map[20], {'preserve': [7, 12, 14, 15, 16], 'remove': [19]}),
(rgn_clusters_map[10], {'preserve': list(set(rgn_clusters_map[10])), 'remove': []})
]
def reorder_clusters_map_by_size(clusters_map):
result_clusters_map = {}
clusters_kvs = list(clusters_map.items())
preserve_clusters = [c for (_, c) in clusters_kvs if c != -1]
for bc, c in zip([bc for (bc, c) in clusters_kvs if c != -1],
reorder_clusters_by_size(preserve_clusters)):
result_clusters_map[bc] = c
return result_clusters_map
refined_clusters_map = {} # Here we'll collect clustering
refined_n_clusters = 0 # This will be used for reordering
rgn_cells = list(rgn_coveragedf[rgn_coverage_filter].columns)
for (clustering, actions) in clustering_actions:
# Remove cells
for bc in [rgn_cells[i] for i in np.flatnonzero(np.isin(clustering, actions['remove']))]:
refined_clusters_map[bc] = -1 # Mark for removal
# Preserve clustering
for i, c in enumerate(actions['preserve']):
for bc in [rgn_cells[i] for i in np.flatnonzero(clustering == c)]:
if bc not in refined_clusters_map:
refined_clusters_map[bc] = refined_n_clusters + i
refined_n_clusters += 1
# Reorder clusters by sizes
refined_clusters_map = reorder_clusters_map_by_size(refined_clusters_map)
refined_n_clusters = len(set(filter(lambda c: c!=-1, refined_clusters_map.values())))
print(f'Result clusters', refined_n_clusters)
print('Result cells', len([c for c in refined_clusters_map.values() if c != -1]))
refined_cells = [bc for bc in rgn_coveragedf.columns if
bc in refined_clusters_map and refined_clusters_map[bc] != -1]
refined_coveragedf = rgn_coveragedf[refined_cells]
refined_lsa_coords = rgn_lsa_coords.loc[refined_cells]
refined_tsne_coords = rgn_tsne_coords.loc[refined_cells]
refined_clusters = [refined_clusters_map[bc] for bc in refined_cells]
# Plot clusters on t-SNE
plot_colored(refined_tsne_coords['tsne1'], refined_tsne_coords['tsne2'], refined_clusters,
clusters=True, show_centers=True, size=20, alpha=0.5)
plt.show()
# Visualize markers mean values
refined_clusters_linkage = clusters_hierarchy(refined_lsa_coords, refined_clusters)
show_markers_mean_values(refined_coveragedf, refined_clusters,
refined_clusters_linkage, rgn_marker_regions_peaks_map)
plt.show()
# print_mem_usage()
# Dupletes to remove
final_clusters_to_remove = {12}
final_clusters_map = {}
n_clusters = len(set(clusters))
for bc, c in zip(coveragedf[coverage_filter].columns, clusters):
if c in refined_group:
if bc in refined_clusters_map:
final_clusters_map[bc] = n_clusters + refined_clusters_map[bc]
else:
if c not in final_clusters_to_remove:
final_clusters_map[bc] = c
# Reorder clusters by sizes
final_clusters_map = reorder_clusters_map_by_size(final_clusters_map)
print(f'Result clusters', len(set(final_clusters_map.values())))
print('Result cells', len([c for c in final_clusters_map.values() if c != -1]))
final_cells = [bc for bc in coveragedf.columns if
bc in final_clusters_map and final_clusters_map[bc] != -1]
final_coveragedf = coveragedf[final_cells]
final_lsa_coords = lsa_coords.loc[final_cells]
final_tsne_coords = tsne_coords.loc[final_cells]
final_clusters = [final_clusters_map[bc] for bc in final_cells]
# Plot clusters on t-SNE
plot_colored(final_tsne_coords['tsne1'], final_tsne_coords['tsne2'], final_clusters,
clusters=True, show_centers=True, size=20, alpha=0.5)
plt.show()
# Visualize markers mean values
final_clusters_linkage = clusters_hierarchy(final_lsa_coords, final_clusters)
show_markers_mean_values(final_coveragedf, final_clusters,
final_clusters_linkage, marker_regions_peaks_map)
plt.show()
# print_mem_usage()
# # print('These clusters into Cell Ranger ATAC coordinates')
# mergeddf1 = pd.merge(pd.DataFrame({'Barcode': tsne_coords.index, 'Cluster': clusters}),
# right=tsne10xdf,
# left_on='Barcode', right_on='Barcode')
# mergeddf2 = pd.merge(pd.DataFrame({'Barcode': tsne_coords.index,
# 'tsne1': tsne_coords['tsne1'],
# 'tsne2': tsne_coords['tsne2']}),
# right=clusters10xdf,
# left_on='Barcode', right_on='Barcode')
# fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 7))
# ax1.title.set_text('Clusters in 10x tsne coordinates')
# plot_colored(mergeddf1[['TSNE-1', 'TSNE-2']].values,
# mergeddf1['Cluster'],
# clusters=True,
# show_centers=True,
# size=20, alpha=0.3, fig_ax=(fig, ax1))
# ax2.title.set_text('10x clusters in tsne coordinates')
# plot_colored(mergeddf2[['tsne1', 'tsne2']].values,
# mergeddf2['Cluster'],
# clusters=True,
# show_centers=True,
# size=20, alpha=0.3, fig_ax=(fig, ax2))
# plt.show()
Pipeline
Use Cell Ranger ATAC to get differential markers.
Cell Ranger ATAC For each peak we perform test cluster vs others, so that several clusters may yield significant results vs others, we take into account only the cluster with greatest mean value.\ Test is is perfromed using Cell Ranger default testing procedure based on Negative Binomial GLM.
# Launch 10x Genomics Cell Ranger DE
def run_10x_differential_expression(df, clusters, fdr=0.05):
""" Compute differential expression for each cluster vs all other cells
Args: df - feature expression data (peak x barcode)
clusters - 0-based cluster labels
fdr - false discovery rate"""
print("Computing params...")
m = df.values
sseq_params = compute_sseq_params(m)
peaks = list(df.index)
cluster_markers = []
for cluster in tqdm(sorted(set(clusters))):
in_cluster = clusters == cluster
group_a = np.flatnonzero(in_cluster)
group_b = np.flatnonzero(np.logical_not(in_cluster))
print(f'Computing DE for cluster {cluster}...')
de_result = sseq_differential_expression(m, group_a, group_b, sseq_params)
de_result['cluster'] = [cluster] * len(de_result)
de_result['peak'] = peaks
passed_fdr = de_result['adjusted_p_value'] < fdr
passed_log2_fc = de_result['log2_fold_change'] > 0
# Filter out different by statistical tests with log2 fold change > 0
passed = np.logical_and(passed_fdr, passed_log2_fc)
print('DE:', sum(passed), 'of', len(de_result),
'FDR:', sum(passed_fdr), 'log2fc>0:', sum(passed_log2_fc))
passed_de = de_result.loc[passed]
cluster_de_markers = passed_de[
['cluster', 'peak', 'norm_mean_a', 'norm_mean_b', 'log2_fold_change', 'p_value', 'adjusted_p_value']
].rename(columns={'norm_mean_a': 'norm_mean_cluster', 'norm_mean_b': 'norm_mean_others'})
cluster_markers.append(cluster_de_markers)
return pd.concat(cluster_markers)
def diff_markers_summary(diff_markers):
print('Number of differential markers per cluster')
t = diff_markers[['cluster']].copy()
t['count'] = 1
display(pd.pivot_table(t, values='count', index=['cluster'], aggfunc=np.sum))
def annotate_diff_markers(diff_markers, closest_genes):
return pd.merge(
left=diff_markers[['cluster', 'norm_mean_cluster', 'norm_mean_others',
'log2_fold_change', 'p_value', 'adjusted_p_value', 'peak']],
right=closest_genes,
left_on='peak',
right_on='peak')
def save_diff_markers(markers_with_genes, path):
print('Saving all the markers', len(markers_with_genes), 'to markers.tsv')
# rearrange columns, sort
t = markers_with_genes[
['chr', 'start', 'end', 'peak',
'cluster', 'norm_mean_cluster', 'norm_mean_others',
'log2_fold_change', 'p_value', 'adjusted_p_value',
'gene', 'distance']
].sort_values(by=['cluster', 'adjusted_p_value'])
t.to_csv(path, sep='\t', index=None)
def get_top_markers(df, unique_clusters, top=100):
return pd.concat([
df.loc[df['cluster'] == c].sort_values(by=['adjusted_p_value']).head(top)
for c in unique_clusters])
def get_diff_markers_z(markers_with_genes, coveragedf, clusters):
print('Computing Z scores for mean values in cluster for top markers')
top_markers = get_top_markers(markers_with_genes, sorted(set(clusters)), top=100)
t = coveragedf.T # Transpose to (barcodes x peaks) format
t = t[list(top_markers['peak'])] # top markers only
t = pd.concat([pd.DataFrame(t.loc[clusters == cluster].mean()).T for cluster in sorted(set(clusters))])
t.index = sorted(set(clusters))
# Transform to Z-score
for c in t.columns:
t[c] = zscore(t[c])
return t.T
def show_top_markers_z(t, xs, ys, markers_with_genes, clusters, n_clusters, top=100):
top_markers = get_top_markers(markers_with_genes, sorted(set(clusters)), top)
nrows, ncols = n_rows_columns(n_clusters)
plt.subplots(nrows=nrows, ncols=ncols, figsize=(5 * ncols, 4 * nrows))
for i, cluster in enumerate(tqdm(set(clusters))):
# Table (marker for cluster x cells)
markers_df = t.loc[top_markers.loc[top_markers['cluster'] == cluster]['peak']].T.copy()
# Z scale for each marker peak across all cells
for c in markers_df.columns:
markers_df[c] = zscore(markers_df[c])
# Average Z score for each cell
zs = markers_df.T.mean()
ax = plt.subplot(nrows, ncols, i + 1)
# Average is much more stable than coverage, we can use linear colormap here
plt.setp(ax, xticks=[], yticks=[])
# Plot zscores scatter plot with 2 segments colormap blue-white-red
cmap = LinearSegmentedColormap.from_list(name='zscore',
colors=[(0, 'darkblue'), ((0 - np.min(zs)) / (np.max(zs) - np.min(zs)), 'white'), (1, 'red')])
sc = ax.scatter(xs, ys, s=5, c=zs, cmap=cmap, edgecolor='none', alpha=1)
# sc = ax.scatter(xs, ys, s=5, c=zs, cmap=plt.cm.seismic, edgecolor='none', alpha=1)
plt.colorbar(sc)
plt.title(f'Cluster {cluster}')
def regions_plot(regions, df, xs, ys, xlabel, ylabel, factors, clusters, binary):
print(f'Coverage z-score, distribution and fraction on non-zero covered peaks w.r.t {FACTOR} and clusters')
fregions = []
for k, peak in regions.items():
if peak in df.index:
fregions.append((k, peak))
else:
print(f'{k} {peak} is missing')
fig = plt.figure(figsize=(4*4, 4*len(fregions)))
grid = plt.GridSpec(nrows=len(fregions), ncols=8, wspace=0.5, hspace=0.5, figure=fig)
for i, (k, peak) in enumerate(fregions):
if peak not in df.index:
print(f'{k} {peak} is missing')
continue
vals = df.loc[peak]
print(k, peak, 'cov>0:', sum(vals > 0), 'of', len(vals), 'max cov:', max(vals))
ax = plt.subplot(grid[i, 0:2])
# Plot zscores scatter plot with 2 segments colormap blue-white-red
zs = zscore(vals)
cmap = LinearSegmentedColormap.from_list(name='zscore',
colors=[(0, 'darkblue'), ((0 - np.min(zs)) / (np.max(zs) - np.min(zs)), 'white'), (1, 'red')])
plt.setp(ax, xticks=[], yticks=[])
ax.scatter(xs, ys, marker='.', c=zs, cmap=cmap, s=2, alpha=0.2)
plt.title(k)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
t = pd.DataFrame({'cluster': clusters, 'factor': factors,
'val': [min(v, 1) for v in vals] if binary else vals})
ax = plt.subplot(grid[i, 2])
# mean non-zero covered peaks by type
tf = pd.pivot_table(t, values='val', index=['factor'], aggfunc='mean').reset_index()
sns.barplot(x='factor', y='val', data=tf)
ax.tick_params(axis='x', labelrotation=90)
plt.ylabel('')
ax = plt.subplot(grid[i, 3:])
# mean non-zero covered peaks by cluster and factor
tc = t.groupby(['cluster', 'factor']).mean().reset_index()
sns.barplot(x='cluster', y='val', data=tc, hue='factor')
plt.ylabel('')
# print('t-SNE based Z-SCORE visualizations for markers, mean z-score for 100 top markers')
# with PdfPages(os.path.join(figures_path, f'tsne_diff_markers_100.pdf')) as pdf:
# show_top_markers_z(coveragedf, tsne_coords['tsne1'], tsne_coords['tsne2'],
# markers_with_genes, clusters, n_clusters, 100)
# pdf.savefig()
# print('t-SNE based Z-SCORE visualizations for markers, z-score for 1 top marker')
# t = coveragedf
# top_markers_1 = get_top_markers(markers_with_genes, n_clusters, top=1)
# top_markers_map = {}
# for i, row in top_markers_1.iterrows():
# top_markers_map[str(row['cluster'])] = row['peak']
# with PdfPages(os.path.join(figures_path, 'tsne_diff_markers_1.pdf')) as pdf:
# regions_plot(top_markers_map, coveragedf,
# tsne_coords['tsne1'], tsne_coords['tsne2'],
# 'tsne1', 'tsne2',
# factors, clusters, True)
# pdf.savefig()
def save_differential_markers(clusters, diff_markers, cluster_peaks_path):
! mkdir -p {cluster_peaks_path}
print('Cleanup peaks_clusters', cluster_peaks_path)
for f in glob.glob(f'{cluster_peaks_path}/markers_*.bed'):
os.remove(f)
mlap_max = (-np.log10(diff_markers.loc[diff_markers['adjusted_p_value'] != 0]['adjusted_p_value'])).max()
print('BED scoring as -log10 adjusted pval')
print('Max of -log10 adjusted pval', mlap_max)
for c in set(clusters):
bed_file = os.path.join(cluster_peaks_path, f'markers_{c}.bed')
markers_cluster = diff_markers.loc[diff_markers['cluster'] == c].sort_values(
['log2_fold_change'], ascending=False
).iloc[::-1]
markers_cluster.index = range(len(markers_cluster))
print('Saving cluster', c, 'marker peaks to', bed_file)
with open(bed_file, 'w') as bed:
for i in range(len(markers_cluster)):
peak = markers_cluster['peak'][i]
ap = markers_cluster['adjusted_p_value'][i]
mlap = -np.log10(ap) if ap != 0.0 else mlap_max
line = '{}\t{}\t{}\t.\n'.format(
re.sub(':|-', '\t', peak),
c,
mlap)
bed.write(line)
print('Done')
Preprocess data for single cell explorer https://ctlab.github.io/SCE
def export_to_cse(df, clusters, tsne_coords, umap_coords, markers,
sce_path, token, name, public, organism,
markers_x_column='gene', markers_gene_column='peak'):
print('SCE path', sce_path)
print('SCE token', token)
print('SCE name', name)
print('SCE public', public)
print('SCE organism', organism)
# Sanity check indexes
cells = list(df.columns)
assert np.array_equal(cells, list(tsne_coords.index))
assert np.array_equal(cells, list(umap_coords.index))
assert len(cells) == len(clusters)
print('Processing', 'dataset.json')
with open(os.path.join(sce_path, 'dataset.json'), 'w') as f:
f.write(json.dumps({
'token': token,
'public': public,
'name': name,
'organism': organism,
'cells': df.shape[1]
}))
print('Processing', 'plot_data.json')
fields = {
"tSNE_1": {
"type": "numeric",
"range": [
min(tsne_coords.loc[df.columns.values]['tsne1']),
max(tsne_coords.loc[df.columns.values]['tsne1'])
]
},
"tSNE_2": {
"type": "numeric",
"range": [
min(tsne_coords.loc[df.columns.values]['tsne2']),
max(tsne_coords.loc[df.columns.values]['tsne2'])
]
},
"UMAP_1": {
"type": "numeric",
"range": [
min(umap_coords.loc[df.columns.values]['umap1']),
max(umap_coords.loc[df.columns.values]['umap1'])
]
},
"UMAP_2": {
"type": "numeric",
"range": [
min(umap_coords.loc[df.columns.values]['umap2']),
max(umap_coords.loc[df.columns.values]['umap2'])
]
},
"Cluster": {
"type": "factor",
"levels": [str(c) for c in set(clusters)]
},
"Age": {
"type": "factor",
"levels": list(FACTORS_MAP.values())
},
"nUmi": {
"type": "numeric",
"range": [
min(df.sum()),
max(df.sum()),
]
}
}
data = pd.DataFrame({
"tSNE_1": tsne_coords.loc[df.columns.values]['tsne1'].tolist(),
"tSNE_2": tsne_coords.loc[df.columns.values]['tsne2'].tolist(),
"UMAP_1": umap_coords.loc[df.columns.values]['umap1'].tolist(),
"UMAP_2": umap_coords.loc[df.columns.values]['umap2'].tolist(),
"Cluster": [str(c) for c in clusters_df.loc[df.columns.values]['cluster']],
"Age": [FACTORS_MAP[FACTOR_FUNCTION(bc)] for bc in df.columns],
"nUmi": df.sum().tolist(),
"_row": list(df.columns)
}).to_dict('records')
annotations = {
"tsne_Cluster_centers": {
"type": "text",
"value": "Cluster",
"coords": [
"tSNE_1",
"tSNE_2"
],
"data": [
{
"Cluster": str(c),
"tSNE_1": float(np.mean(tsne_coords.loc[df.columns.values].values[
np.flatnonzero(clusters_df.loc[df.columns.values]['cluster'] == c), 0])),
"tSNE_2": float(np.mean(tsne_coords.loc[df.columns.values].values[
np.flatnonzero(clusters_df.loc[df.columns.values]['cluster'] == c), 1])),
"Text": str(c)
} for c in set(clusters)
]
},
}
with open(os.path.join(sce_path, 'plot_data.json'), 'w') as f:
f.write(json.dumps({
"fields": fields,
"data": data,
"annotations": annotations,
}))
# This file simply contains gene names, cell barcodes/names and total UMI per cell.
# This file must reflect row and column names of matrix data.h5 which contains expression data.
print('Processing', 'exp_data.json')
with open(os.path.join(sce_path, 'exp_data.json'), 'w') as f:
f.write(json.dumps({
"genes": list(df.index.values),
"barcodes": list(df.columns.values),
"totalCounts": df.sum(axis=0).astype(int).tolist()
}))
print('Processing', 'markers.json')
# Take into account only 100 top markers
with open(os.path.join(sce_path, 'markers.json'), 'w') as f:
f.write(json.dumps({
"Cluster": [
{
'X': row[markers_x_column],
'p_val': row['p_value'],
'p_val_adj': row['adjusted_p_value'],
'avg_logFC': row['log2_fold_change'],
'pct.1': row['norm_mean_cluster'],
'pct.2': row['norm_mean_others'],
'cluster': str(row['cluster']),
'gene': row[markers_gene_column],
'distance': row['distance']
} for index, row in markers.iterrows()
]
}))
print('Processing', 'data.h5')
! rm {sce_path}/data.h5
# Fix missing locks
os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'
# Don't use pandas to_hdf5 because of unused overhead.
# IMPORTANT: transpose matrix because of different order!
# Column-major order is used by Fortran, Matlab, R, and most underlying core linear algebra libraries (BLAS).
# Row-major ordering is sometimes called “C” style ordering and column-major ordering “Fortran” style.
# Python/NumPy refers to the orderings in array flags as C_CONTIGUOUS and F_CONTIGUOUS, respectively.
# https://cran.r-project.org/web/packages/reticulate/vignettes/arrays.html
expdf = df.astype(int).clip(upper=1) # Clip coverage to 0-1 for better visualization
with h5py.File(os.path.join(sce_path, 'data.h5'), 'w') as hf:
hf.create_dataset("expression/mat",
data=expdf.T.values, # Column-major ordering!
compression="gzip", compression_opts=9,
dtype='i4') # int4 is required here for SCE, see https://github.com/ctlab/SCE/issues/4
print('Done')
def prepare_sce_for_genes(coveragedf, closest_genes, markers_with_genes):
print(f'Associated genes {len(closest_genes)}')
t = closest_genes[['gene', 'peak']].copy()
t['distance'] = np.abs(closest_genes['distance'])
t = t.loc[t['distance'] < SCE_PEAK_GENE_TSS_MAX_DISTANCE]
print(f'Associated genes with distance to TSS < {SCE_PEAK_GENE_TSS_MAX_DISTANCE} {len(t)}')
print('Building mapping peak -> gene, closest peaks is mapped to gene, second closest with suffix _1, etc')
t.sort_values(by=['gene', 'distance'], inplace=True)
associated_genes_map = {}
prev_gene = None
prev_count = 1
for _, (gene, peak, distance) in t.iterrows():
if prev_gene != gene:
associated_genes_map[peak] = gene
prev_count = 1
prev_gene = gene
else:
associated_genes_map[peak] = f'{gene}_{prev_count}'
prev_count += 1
print('Preparing coverage dataframe per gene')
genes_filtereddf = coveragedf.loc[coveragedf.index.isin(associated_genes_map)].copy()
genes_filtereddf.index = [associated_genes_map[peak] for peak in genes_filtereddf.index]
print('Preparing top markers by gene')
associated_markers_with_genes = \
markers_with_genes.loc[np.abs(markers_with_genes['distance']) < SCE_PEAK_GENE_TSS_MAX_DISTANCE].copy()
print('Updating markers genes names by distance to gene with prefix')
associated_markers_with_genes['gene'] = [
associated_genes_map[f'{chr}:{start}-{end}'] for chr, start, end in
zip(associated_markers_with_genes['chr'],
associated_markers_with_genes['start'],
associated_markers_with_genes['end'])
]
return genes_filtereddf, associated_markers_with_genes
def prepare_full_package1(coveragedf, clusters, lsa_coords, tsne_coords, umap_coords,
marker_regions_peaks_map,
name='final'):
print('1. Saving tSNE coordinates')
tsne_coords.to_csv(os.path.join(OUTPUT_DIR, f'{name}_tsne.tsv'), sep='\t')
print('2. tSNE log10 coverate depth')
with PdfPages(os.path.join(figures_path, f'{name}_tsne.pdf')) as pdf:
plot_colored(tsne_coords['tsne1'], tsne_coords['tsne2'], np.log10(coveragedf.sum()), size=20)
pdf.savefig()
print('3. tSNE', FACTOR, FACTORS_MAP)
factors = np.array([FACTOR_FUNCTION(bc) for bc in lsa_coords.index])
with PdfPages(os.path.join(figures_path, f'{name}_tsne_{FACTOR}.pdf')) as pdf:
plot_colored(tsne_coords['tsne1'], tsne_coords['tsne2'],
factors, clusters=True, size=20, alpha=0.1)
pdf.savefig()
print(f'4. Saving clusters')
n_clusters = len(set(clusters))
clusters_df = pd.DataFrame({'cluster': clusters}, index=lsa_coords.index)
clusters_df.to_csv(os.path.join(OUTPUT_DIR, f'{name}_clusters_{n_clusters}.tsv'))
print('5. Computing summary for clusters by', FACTOR)
print(FACTORS_MAP)
t = pd.DataFrame({FACTOR: [FACTOR_FUNCTION(bc) for bc in coveragedf.columns],
'cluster': clusters, 'counts': np.ones(len(coveragedf.columns))},
index=coveragedf.columns)
print('6. Saving clusters sizes')
with PdfPages(os.path.join(figures_path, f'{name}_clusters_sizes.pdf')) as pdf:
clusters_sizes(t, 'counts')
pdf.savefig()
print('7. Saving clusters composition by', FACTOR)
with PdfPages(os.path.join(figures_path, f'{name}_clusters_{FACTOR}.pdf')) as pdf:
clusters_factors(t, 'counts', FACTORS_MAP)
pdf.savefig()
print('8. Saving UMI per clusters')
with PdfPages(os.path.join(figures_path, f'{name}_clusters_umi.pdf')) as pdf:
show_umi_coverage_per_cell_in_clusters(coveragedf, clusters)
pdf.savefig()
print('9. Saving cells per peak')
with PdfPages(os.path.join(figures_path, f'{name}_clusters_cell_peaks.pdf')) as pdf:
show_cells_per_peak_in_clusters(coveragedf, clusters)
pdf.savefig()
print('10. tSNE clusters')
with PdfPages(os.path.join(figures_path, f'{name}_tsne_clusters.pdf')) as pdf:
plot_colored(tsne_coords['tsne1'], tsne_coords['tsne2'],
clusters, clusters=True, show_centers=True, size=20, alpha=0.5)
pdf.savefig()
print('11. tSNE clusters split')
with PdfPages(os.path.join(figures_path, f'{name}_tsne_clusters_split.pdf')) as pdf:
plot_colored_split(tsne_coords['tsne1'], tsne_coords['tsne2'], clusters, size=5)
pdf.savefig()
print('12. Visualize markers mean values')
clusters_linkage = clusters_hierarchy(lsa_coords, clusters)
with PdfPages(os.path.join(figures_path, f'{name}_makers_heatmap.pdf')) as pdf:
show_markers_mean_values(coveragedf, clusters, clusters_linkage, marker_regions_peaks_map)
pdf.savefig()
print('13. Analysing REGIONS of interest TSNE')
with PdfPages(os.path.join(figures_path, f'{name}_tsne_markers.pdf')) as pdf:
regions_plot(marker_regions_peaks_map, coveragedf,
tsne_coords['tsne1'], tsne_coords['tsne2'],
'tsne1', 'tsne2',
[FACTORS_MAP[f] for f in factors], clusters, True)
pdf.savefig()
print('14. Saving UMAP coordinates to umap.tsv')
umap_coords.to_csv(os.path.join(OUTPUT_DIR, f'{name}_umap.tsv'), sep='\t')
print('15. UMAP log10 coverate depth')
with PdfPages(os.path.join(figures_path, f'{name}_umap.pdf')) as pdf:
plot_colored(umap_coords['umap1'], umap_coords['umap2'], np.log10(coveragedf.sum()), size=20)
pdf.savefig()
print('16. UMAP', FACTOR, FACTORS_MAP)
with PdfPages(os.path.join(figures_path, f'{name}_umap_{FACTOR}.pdf')) as pdf:
factors = [FACTOR_FUNCTION(bc) for bc in lsa_coords.index]
plot_colored(umap_coords['umap1'], umap_coords['umap2'], factors, clusters=True, size=20, alpha=0.01)
pdf.savefig()
print('17. Plot UMAP split by factor')
print(FACTORS_MAP)
with PdfPages(os.path.join(figures_path, f'{name}_umap_{FACTOR}_spit.pdf')) as pdf:
plot_colored_split(umap_coords['umap1'], umap_coords['umap2'], factors, size=5, alpha=0.1)
pdf.savefig()
print('18. UMAP clusters')
with PdfPages(os.path.join(figures_path, f'{name}_umap_clusters.pdf')) as pdf:
plot_colored(umap_coords['umap1'], umap_coords['umap2'],
clusters, clusters=True, show_centers=True, size=20, alpha=0.3)
pdf.savefig()
print('19. UMAP clusters split')
with PdfPages(os.path.join(figures_path, f'{name}_umap_clusters_split.pdf')) as pdf:
plot_colored_split(umap_coords['umap1'], umap_coords['umap2'], clusters, size=5)
pdf.savefig()
print('20. Analysing REGIONS of interest UMAP')
with PdfPages(os.path.join(figures_path, f'{name}_umap_markers.pdf')) as pdf:
regions_plot(marker_regions_peaks_map, coveragedf,
umap_coords['umap1'], umap_coords['umap2'],
'umap1', 'umap2',
[FACTORS_MAP[f] for f in factors], clusters, True)
pdf.savefig()
print('21. Save cluster mean values to table with closest genes information')
clusters_means_df = clusters_mean_coverage(coveragedf, clusters)
closest_genes = locations_closest_genes(list(coveragedf.index), transcripts_tss)
# Restore 'peak' to be able to merge on it
closest_genes['peak'] = closest_genes['chr'] +\
':' + closest_genes['start'].astype(str) +\
'-' + closest_genes['end'].astype(str)
clusters_means_genes_df = pd.concat([closest_genes.reset_index(), clusters_means_df], axis=1)\
.drop(columns=['index'])
clusters_means_genes_df.to_csv(os.path.join(OUTPUT_DIR, f'{name}_clusters_peaks_values.tsv'),
sep='\t', index=False)
print('Done')
def prepare_full_package2(fragments_tsv_gz, coveragedf, clusters,
name='final'):
print('1. Saving bigwigs')
bigwigs_path = os.path.join(OUTPUT_DIR, f'{name}_bigwig')
! mkdir -p {bigwigs_path}
write_cluster_bigwigs(fragments_tsv_gz, list(coveragedf.columns), clusters, bigwigs_path, step=200)
print('Done')
def prepare_full_package3(coveragedf, clusters, tsne_coords, umap_coords,
name='final'):
print('1. Cell Ranger ATAC like differential analysis')
# Each peak is tested independently, no normalization to peak length required.
diff_markers = run_10x_differential_expression(coveragedf, clusters)
print('Total diff markers', len(diff_markers))
diff_markers_summary(diff_markers)
print('2. Saving differential markers')
save_differential_markers(clusters, diff_markers, os.path.join(OUTPUT_DIR, f'{name}_markers'))
print('3. Saving differential markers with closest genes info')
closest_genes = locations_closest_genes(list(coveragedf.index), transcripts_tss)
# Restore 'peak' to be able to merge on it
closest_genes['peak'] = closest_genes['chr'] +\
':' + closest_genes['start'].astype(str) +\
'-' + closest_genes['end'].astype(str)
markers_with_genes = annotate_diff_markers(diff_markers, closest_genes)
save_diff_markers(markers_with_genes, os.path.join(OUTPUT_DIR, f'{name}_markers.tsv'))
print('4. Saving differential markers z scores')
diff_markers_z = get_diff_markers_z(markers_with_genes, coveragedf, clusters)
with PdfPages(os.path.join(figures_path, f'{name}_markers_z.pdf')) as pdf:
sns.clustermap(diff_markers_z, col_cluster=False, row_cluster=False,
figsize=(10, 10), cmap=plt.cm.seismic)
pdf.savefig()
print('5. Saving SCE dataset')
sce_path = os.path.join(OUTPUT_DIR, f'{name}_sce')
! mkdir -p {sce_path}
export_to_cse(coveragedf, clusters, tsne_coords, umap_coords,
markers=markers_with_genes,
sce_path=sce_path,
token=f'{SCE_TOKEN}_{name}',
name=f'{SCE_NAME} {name}',
public=SCE_PUBLIC,
organism=SCE_ORGANISM)
print('6. Saving SCE genes dataset')
genes_sce_path = os.path.join(OUTPUT_DIR, f'{name}_sce_genes')
! mkdir -p {genes_sce_path}
genes_filtereddf, associated_markers_with_genes =\
prepare_sce_for_genes(coveragedf, closest_genes, markers_with_genes)
export_to_cse(genes_filtereddf, clusters, tsne_coords, umap_coords,
markers=associated_markers_with_genes,
sce_path=genes_sce_path,
token=f'{SCE_TOKEN}_{name}_genes',
name=f'{SCE_NAME} {name} by genes',
public=SCE_PUBLIC,
organism=SCE_ORGANISM,
markers_x_column='peak', markers_gene_column='gene')
print('Done')
def prepare_full_package(fragments_tsv_gz, coveragedf, lsa_coords, clusters, marker_regions_peaks_map,
name='final',
tsne_coords=None, umap_coords=None):
if tsne_coords is None:
print('Processing t-SNE')
tsne_coords = pd.DataFrame(TSNE(n_components=2).fit_transform(lsa_coords.values),
index=lsa_coords.index,
columns=['tsne1', 'tsne2'])
if umap_coords is None:
print('Processing UMAP')
umap_coords = pd.DataFrame(umap.UMAP().fit_transform(lsa_coords.values)[:, :2],
index=lsa_coords.index,
columns=['umap1', 'umap2'])
# Sanity check indexes
cells = list(coveragedf.columns)
assert np.array_equal(cells, list(tsne_coords.index))
assert np.array_equal(cells, list(umap_coords.index))
assert len(clusters) == len(cells)
print('Processing part 1')
prepare_full_package1(coveragedf, clusters, lsa_coords, tsne_coords, umap_coords,
marker_regions_peaks_map,
name)
print('Processing part 2')
prepare_full_package2(fragments_tsv_gz, coveragedf, clusters,
name)
print('Processing part 3')
prepare_full_package3(coveragedf, clusters, tsne_coords, umap_coords,
name)
# Don't capture figures
%matplotlib agg
prepare_full_package(
os.path.join(rgn_dir, f'cluster_{rgn}.tsv.gz'),
refined_coveragedf, refined_lsa_coords, refined_clusters,
rgn_marker_regions_peaks_map,
name='nk_t',
tsne_coords=refined_tsne_coords
)
# Plot inline figures
%matplotlib inline
# Don't capture figures
%matplotlib agg
prepare_full_package(
FRAGMENTS_FILE_10X,
final_coveragedf, final_lsa_coords, final_clusters,
marker_regions_peaks_map,
name='pbmc',
tsne_coords=final_tsne_coords
)
# Plot inline figures
%matplotlib inline