Single cell ATAC-seq explorer

This single cell ATAC-Seq analysis pipeline is designed for integrative analysis of datasets initially processed by 10x Genomics Cell Ranger ATAC. Aggregated datasets are also supported!

In addition to 10x Genomics results it offers:

  • Capable of processing data aggregated by 10x Genomics Cell Ranger ATAC
  • Flexible data preprocessing and normalization methods
  • Summary of the different conditions in case of an aggregated dataset
  • Different types of clustering followed by heatmap and t-SNE/UMAP visualizations in low-dimensional space
  • Visualization of top cluster markers as heatmaps on t-SNE/UMAP plots
  • Closest-gene annotations for peaks and clusters
  • Annotated markers analysis
  • BigWig and BED files for clusters and markers, ready to be visualized in the JBR Genome Browser by JetBrains Research
  • Data preparation for the single cell explorer by Artyomov Lab, Washington University in St. Louis
  • All figures saved in PDF format, ready for publication

Required 10x Genomics Cell Ranger ATAC files:

  • fragments.tsv - fragments file provided by Cell Ranger
  • peaks.bed - peaks file (union of peaks in case of merging)
  • clusters.csv - graph clustering after median normalization, IDF scaling, SVD projection and L2 normalization
  • projection.csv - t-SNE 2D projection of all the cells

Pipeline steps:

  • Cell Calling
  • Peak barcode matrix
  • UMI normalization
  • Feature selection
  • Dimensionality reduction
  • t-SNE / UMAP
  • Differential markers analysis
  • Supervised annotation of clusters by gene markers

Other pipelines:

Questions and comments are welcome at oleg dot shpynov at jetbrains dot com.

In [1]:
# Ensure all the libraries are installed
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import gc
import glob
import h5py
import json
import math
import os
import re
import sys
import tempfile

import matplotlib.pyplot as plt
import numpy as np
# Workaround for the issue: https://github.com/pandas-dev/pandas/issues/26314
# pip install pandas==0.21
import pandas as pd
import pyBigWig
import seaborn as sns
import umap
from scipy import io, sparse

sns.set_style("whitegrid")

from matplotlib.colors import LinearSegmentedColormap
from collections import Counter
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.preprocessing import Normalizer
from irlb import irlb
from pybedtools import BedTool
from tqdm.auto import tqdm
from scipy.stats import zscore
from matplotlib.backends.backend_pdf import PdfPages

from diffexp import *

# Fix random seed
np.random.seed(0)
In [2]:
# Configuration of 10x files
FRAGMENTS_FILE_10X = '/mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/BAAJ-AS009/outs/fragments.tsv.gz'
CLUSTERS_FILE_10X = '/mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/BAAJ-AS009/outs/analysis/clustering/graphclust/clusters.csv'
TSNE_FILE_10X = '/mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/BAAJ-AS009/outs/analysis/tsne/2_components/projection.csv'
PEAKS_FILE_10X = '/mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/BAAJ-AS009/outs/peaks.bed'

# See https://www.encodeproject.org/annotations/ENCSR636HFF/
BLACKLIST_FILE = '/mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/ENCFF547MET.bed'

# ! wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gtf.gz
# ! wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/gencode.v32.annotation.gtf.gz
# ! gunzip *.gtf.gz
GTF_FILE = '/mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/gencode.vM25.annotation.gtf'

# Download DNase hypersensitivity sites from encodeproject.org
# representative DNase hypersensitivity sites for hg38 are missing
# ! wget https://www.encodeproject.org/files/ENCFF108APF/@@download/ENCFF108APF.bed.gz
# ! gunzip *.gz
DNASE_FILE = '/mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/ENCFF108APF.bed'
In [3]:
# Factors configuration
FACTOR = 'type'
FACTORS_MAP = {1: 'WT', 2: 'WT + IL-12', 3: 'BATF3 KO', 4: 'BATF3 KO + IL-12'}

# Aggregated barcodes always carry a numeric suffix marking the original track
def FACTOR_FUNCTION(x):
    for k in [1, 2, 3, 4]:
        if x.endswith(f'-{k}'):
            return k
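For example, with the cellranger-atac aggr suffix convention assumed above (the barcode itself is hypothetical):

assert FACTOR_FUNCTION('AAACGAAAGCGCAATG-2') == 2  # FACTORS_MAP[2] == 'WT + IL-12'
assert FACTOR_FUNCTION('AAACGAAAGCGCAATG-4') == 4  # FACTORS_MAP[4] == 'BATF3 KO + IL-12'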
In [4]:
# Markers configuration
MARKERS_REGIONS = {
    'Clec4a4': 'chr6:122989896-122990500',
    'Lysmd2': 'chr9:75625503-75626285',
    'Ube2e2': 'chr14:18893297-18894325',
    'H2-Eb2': 'chr17:34325617-34326084',
    'Nudt17': 'chr3:96708301-96708735',
#     'Ccr7': 'chr11:99155002-99155345',
    'Tnfrsf4': 'chr4:156013831-156013972',
    'Socs2': 'chr10:95415887-95417583',
    'Clec4a3': 'chr6:122952358-122953012',
    'Csf1r': 'chr18:61106717-61108648',
    'Adgre4': 'chr17:55749859-55750075',
    'Cd300ld': 'chr11:114989780-114990251',
    'CD24a': 'chr10:43578284-43580870',
    'Xcr1': 'chr9:123863360-123863837',
    'Snx22': 'chr9:66069435-66069780',
    'Cxcl9': 'chr5:92327859-92328300',
    'Cd226': 'chr18:89176811-89177066',
    'Kcne3': 'chr7:100178463-100179065',
    'Clec10a': 'chr11:70165825-70166889',
    'Selenop': 'chr15:3270458-3271525',
    'Mki67': 'chr7:135715678-135716658',
    'Ube2c': 'chr2:164769517-164769967',
    'Adgrg1': 'chr8:94984224-94985133',
    'Adgrg3': 'chr8:95017624-95017756',
    'Egfl7': 'chr2:26578596-26578738',
}
In [5]:
# Output configuration
OUTPUT_DIR = '/mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline'
figures_path = os.path.join(OUTPUT_DIR, 'figures')
! mkdir -p {figures_path}

Pipeline utils

In [353]:
# Used memory analysis utility
def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

# Cleanup
gc.collect()

# Print all allocated variables
def print_mem_usage():
    for name, size in sorted(((name, sys.getsizeof(value)) for name, value in globals().items()),
                             key= lambda x: x[1],
                             reverse=True)[:10]:
        print("Global {:>30}: {:>8}".format(name, sizeof_fmt(size)))    
    for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
                             key= lambda x: x[1],
                             reverse=True)[:10]:
        print("Local {:>30}: {:>8}".format(name, sizeof_fmt(size)))

10x Cell Ranger results

In [8]:
clusters10xdf = pd.read_csv(CLUSTERS_FILE_10X, sep=',')
clusters10xdf['Cluster'] = clusters10xdf['Cluster'].astype(int) - 1 # Clusters start with 1 in 10x
clusters10xdf[FACTOR] = [FACTOR_FUNCTION(bc) for bc in clusters10xdf['Barcode']]
# display(clusters10xdf.head())
print('Total clusters', len(set(clusters10xdf['Cluster'])))
Total clusters 24
In [192]:
def clusters_sizes(df, value):
    df_sum = df[['cluster', value]].groupby(['cluster']).sum().reset_index()
    plt.figure(figsize=(10, 4))
    cmap=plt.cm.get_cmap('jet', len(df_sum))
    sns.barplot(x=df_sum.index, y=df_sum[value],
                palette=[cmap(i) for i in range(len(df_sum))])


def clusters_factors(df, value, FACTORS_MAP):
    df_sum = df[['cluster', value]].groupby(['cluster']).sum().reset_index()    
    df_sum_factor = df[['cluster', FACTOR, value]].groupby(['cluster', FACTOR]).sum().reset_index()
    plt.figure(figsize=(10, 4))
    cmap=plt.cm.get_cmap('jet', len(FACTORS_MAP))
    
    bottom = np.zeros(len(df_sum))
    for i, (k,v) in enumerate(FACTORS_MAP.items()):            
            heights = np.array(df_sum_factor.loc[df_sum_factor[FACTOR] == k][value]) / np.array(df_sum[value]) * 100.0
            plt.bar(x = df_sum.index, height = heights, bottom=bottom, color = cmap(i), width=0.8)
            bottom += heights

    # Cosmetics
    plt.xticks(df_sum.index)
    plt.gca().xaxis.grid(False)
    plt.gca().margins(x=0.005)
    
    legend_colors = [plt.Rectangle((0,0), 1, 1, fc=cmap(i), edgecolor = 'none') 
                         for i in range(len(FACTORS_MAP))]
    legend_names = [v for _,v in FACTORS_MAP.items()]
    l = plt.legend(legend_colors, legend_names, prop={'size': 10}, bbox_to_anchor=(1.01, 0.99))
    l.draw_frame(False)

    plt.ylabel('Cluster composition ' + FACTOR + ' %')
    plt.xlabel('Cluster')
In [193]:
print('Size of clusters analysis')
t = clusters10xdf.copy()
t['counts'] = np.ones(len(t))
t.rename(columns={'Cluster': 'cluster'}, inplace=True)
t.index = t['Barcode']
print('Summary by', FACTOR, FACTORS_MAP)
display(t[[FACTOR, 'counts']].groupby([FACTOR]).sum())

clusters_sizes(t[['cluster', 'counts', FACTOR]], 'counts')
plt.show()

clusters_factors(t[['cluster', 'counts', FACTOR]], 'counts', FACTORS_MAP)
plt.show()

# Cleanup memory
del t
gc.collect()
Size of clusters analysis
Summary by type {1: 'WT', 2: 'WT + IL-12', 3: 'BATF3 KO', 4: 'BATF3 KO + IL-12'}
       counts
type
1     10312.0
2     11055.0
3      6995.0
4      9383.0
Out[193]:
16433
In [91]:
def plot_colored(data, c,
                 clusters=False, show_centers=False, 
                 size=10, alpha=0.5, no_ticks=True, fig_ax=None):
    """ Plot colored scatterplot """
    cmap = plt.cm.get_cmap('jet', len(set(c))) if clusters else plt.cm.viridis
    if fig_ax is not None:
        fig, ax = fig_ax
    else:
        fig, ax = plt.subplots(1, figsize=(10, 8))
    sc = ax.scatter(data[:, 0], data[:, 1], 
                    c=c, cmap=cmap,
                    marker='o',
                    alpha=alpha,
                    s=size)
    if no_ticks:
        plt.setp(ax, xticks=[], yticks=[])
    if clusters and show_centers:
        # For each cluster, add a text label at its center
        for cluster in set(c):
            cluster_mean = data[np.flatnonzero(c == cluster), :].mean(axis=0)
            ax.text(cluster_mean[0], cluster_mean[1], str(cluster), 
                     horizontalalignment='center', size='large', color='black', weight='bold')            
    if clusters:
        # Use the scatter mappable directly instead of the ax.get_children()[0] hack
        cbar = plt.colorbar(sc, ax=ax, boundaries=np.arange(len(set(c)) + 1) + min(c) - 0.5)
        cbar.set_ticks(np.arange(len(set(c))) + min(c))
        cbar.set_ticklabels(list(set(c)))
    else:
        plt.colorbar(sc, ax=ax)

def n_rows_columns(n, maxcols=5):
    ncols = max([r for r in range(1, min(maxcols, math.ceil(math.sqrt(n)) + 1)) if n % r == 0])
    nrows = int(n / ncols)
    return nrows, ncols

    
def plot_colored_split(data, c, size=10, alpha=0.5, contrast = 100):
    """ Plot colored scatterplot split by factors"""
    cmap = plt.cm.get_cmap('jet', len(set(c)))
    nrows, ncols = n_rows_columns(len(set(c)))
    plt.subplots(nrows=nrows, ncols=ncols, figsize=(5 * ncols, 5 * nrows))
    xmin, xmax = np.min(data[:,0]), np.max(data[:,0])
    xlim = (xmin - 0.1 * (xmax - xmin), xmax + 0.1 * (xmax - xmin)) # +10% margin
    ymin, ymax = np.min(data[:,1]), np.max(data[:,1])
    ylim = (ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)) # +10% margin
    for i, f in enumerate(tqdm(set(c))):
        ax = plt.subplot(nrows, ncols, i + 1)
        plt.setp(ax, xticks=[], yticks=[])
        plt.xlim(xlim)
        plt.ylim(ylim)
        plt.title(str(f))
        # Plot slightly visible other as gray
        nfdata = data[np.flatnonzero(np.logical_not(np.equal(c, f))), :]
        plt.scatter(nfdata[:,0], nfdata[:,1], 
            color='gray',
            marker='o',
            alpha=alpha / contrast,
            s=size)
        # And factor data
        fdata = data[np.flatnonzero(np.equal(c, f)), :]        
        plt.scatter(fdata[:,0], fdata[:,1], 
                    color=cmap(i),
                    marker='o',
                    alpha=alpha,
                    s=size)
In [92]:
print('Cell Ranger ATAC original T-SNE visualization')
tsne10xpath = TSNE_FILE_10X
tsne10xdf = pd.read_csv(tsne10xpath, sep=',')

mergeddf = pd.merge(left=tsne10xdf,
                    right=clusters10xdf, 
                    left_on='Barcode', right_on='Barcode')

plot_colored(mergeddf[['TSNE-1', 'TSNE-2']].values, mergeddf['Cluster'], 
             clusters=True,
             show_centers=True,
             size=20, alpha=0.5)
plt.show()
Cell Ranger ATAC original T-SNE visualization
In [35]:
print('10x Cell Ranger tSNE vs', FACTOR, FACTORS_MAP)
plot_colored(mergeddf[['TSNE-1', 'TSNE-2']].values, clusters10xdf[FACTOR], clusters=True, size=20, alpha=0.1)
plt.show()
10x Cell Ranger tSNE vs type {1: 'WT', 2: 'WT + IL-12', 3: 'BATF3 KO', 4: 'BATF3 KO + IL-12'}
In [37]:
with PdfPages(os.path.join(figures_path, '10x_tsne_clusters_split.pdf')) as pdf:
    plot_colored_split(mergeddf[['TSNE-1', 'TSNE-2']].values, clusters10xdf[FACTOR], size=5, alpha=0.1)
    pdf.savefig()

Peaks file stats

In [38]:
with PdfPages(os.path.join(figures_path, 'peaks_length.pdf')) as pdf:
    peaks_df = pd.read_csv(PEAKS_FILE_10X, sep='\t', header=None)
    print('Peaks', len(peaks_df))
    sns.distplot(np.log10(peaks_df[2] - peaks_df[1]))
    plt.title('Peak length distribution')
    plt.xlabel('Log10 Length')
    plt.ylabel('Frequency')
Peaks 125944

Overlap of peaks with representative DNase hypersensitive sites

In [39]:
if DNASE_FILE is not None:
    dnase = BedTool(DNASE_FILE)
    peaks_file = BedTool(PEAKS_FILE_10X)
    peak_count = peaks_file.count()
    overlap = peaks_file.intersect(dnase, wa=True, u=True).count()
    peak_by_dnase = 0 if overlap == 0 else overlap * 100.0 / peak_count
    overlap = dnase.intersect(peaks_file, wa=True, u=True).count()
    dnase_by_peak = overlap * 100.0 / dnase.count()
    print('Fraction of peaks overlapping with representative DNase', int(peak_by_dnase), '%')
    print('Fraction of representative DNase overlapping with peaks', int(dnase_by_peak), '%')
Fraction of peaks overlapping with representative DNase 93 %
Fraction of representative DNase overlapping with peaks 18 %

The pipeline

Cell Calling

Pipeline uses noise_threshold to drop low-coverage noise barcodes and duplets_threshold to drop suspiciously high-coverage barcodes (likely doublets).


Cell Ranger ATAC aggregation: when combining data from multiple GEM groups, the cellranger-atac aggr pipeline automatically equalizes the sensitivity of the groups before merging, which is the recommended approach in order to avoid the batch effect introduced by sequencing depth. Default method: Subsample fragments from higher-depth GEM wells until they all have an equal number of unique fragments per cell.

For each barcode:

  • We have the record of mapped high-quality fragments that passed all filters (the fragments.tsv file).
  • Having determined peaks prior to this, we use the number of fragments that overlap any peak region.
  • We separate the signal from noise.

This works better in practice than naively using the number of fragments per barcode. We first subtract a depth-dependent fixed count from all barcode counts to model whitelist contamination. This fixed count is the estimated number of fragments per barcode that originated from a different GEM, assuming a contamination rate of 0.02. Then we fit a mixture model of two negative binomial distributions to capture the signal and noise. Setting an odds ratio of 1000, we separate the barcodes that correspond to real cells from the non-cell barcodes.
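The pipeline itself sticks to the simple fixed thresholds used in filter_cells below. Purely to illustrate the mixture idea, here is a minimal sketch separating signal from noise barcodes with a two-component mixture on log-scaled counts, using a Gaussian mixture as a stand-in for Cell Ranger's negative binomial mixture (the function name, contamination handling and cutoffs are illustrative, not Cell Ranger's implementation):

import numpy as np
from sklearn.mixture import GaussianMixture

def call_cells_sketch(counts, contamination=0.02, odds_ratio=1000.0):
    """Separate cell barcodes from noise with a 2-component mixture model."""
    counts = np.asarray(counts, dtype=float)
    # Crude stand-in for the depth-dependent whitelist-contamination correction
    adjusted = np.maximum(counts - contamination * np.median(counts), 1.0)
    x = np.log10(adjusted).reshape(-1, 1)
    gm = GaussianMixture(n_components=2, random_state=0).fit(x)
    signal = int(np.argmax(gm.means_))  # component with the higher mean = cells
    prob = gm.predict_proba(x)
    odds = prob[:, signal] / np.maximum(prob[:, 1 - signal], 1e-300)
    return odds > odds_ratio  # True = barcode called as a cell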

Scasat If a cell has fewer open peaks than a user-defined threshold (default: 50 peaks) in the peak accessibility matrix, we remove that cell from subsequent analysis. Likewise, peaks not observed across a user-defined number of valid cells (default: 10 cells) are not considered for further downstream analysis.
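A Scasat-style filter is a few lines over a (peaks x cells) matrix like the coveragedf built below; a minimal sketch, with the defaults stated above (the function name is ours):

def scasat_filter(df, min_peaks_per_cell=50, min_cells_per_peak=10):
    """Drop cells with too few open peaks and peaks seen in too few cells."""
    open_entries = df > 0
    cells_keep = open_entries.sum(axis=0) >= min_peaks_per_cell  # per cell (column)
    peaks_keep = open_entries.sum(axis=1) >= min_cells_per_peak  # per peak (row)
    return df.loc[peaks_keep, cells_keep]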

SnapATAC takes an alternative approach: instead of pre-called peaks, it bins the genome at 1 kb, 5 kb and 10 kb resolution.
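As a rough illustration of that bin-based alternative, a sketch that counts fragments per fixed-width bin instead of per peak (it assumes a fragments DataFrame with the chr/start/end/barcode columns used later in this notebook; not SnapATAC's actual code):

def fragments_to_bins(fdf, bin_size=5000):
    """Count fragments per (barcode, bin) at a fixed genomic resolution."""
    t = fdf.copy()
    midpoints = (t['start'] + t['end']) // 2
    t['bin'] = t['chr'] + ':' + ((midpoints // bin_size) * bin_size).astype(str)
    return t.groupby(['barcode', 'bin']).size().rename('fragments').reset_index()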

Existing computational methods rely on pre-defined regions of transposase accessibility identified from the aggregate signals. ...

Limitations:

  1. It requires a sufficient number of single cell profiles to create a robust aggregate signal for peak calling.
  2. The cell type identification is biased toward the most abundant cell types in the tissues.
  3. These techniques lack the ability to reveal regulatory elements in the rare cell populations which are underrepresented in the aggregate signal. This concern is critical, for example, in brain tissue, where key neuron types may represent less than 1% of all cells while still playing a critical role in the neural circuit.
In [40]:
# Fragments file provided by cell ranger
if os.path.exists(FRAGMENTS_FILE_10X):
    # BedTools doesn't work with gz file, so unzip it
    !gunzip {FRAGMENTS_FILE_10X}
fragments_file = re.sub(r'\.gz$', '', FRAGMENTS_FILE_10X)
In [41]:
# Intersect using bedtools rather than pybedtools, which is too slow for files of this size!
def intersect_fragments_and_peaks(fragments_file, peaks_file, blacklist_file):
    print('Fragments')
    ! wc -l {fragments_file}
    idf = None
    print('Blacklist')
    ! wc -l {blacklist_file}
    print('Peaks')
    ! wc -l {peaks_file}

    with tempfile.TemporaryDirectory(prefix='pipeline', dir=OUTPUT_DIR) as td:
        print('Filtering out non-standard chromosomes')
        chr_filtered_file = os.path.join(td, 'chromosome_filtered.bed')
        ! cat {fragments_file} | grep -i -E 'chr[0-9mt]+[^_]' > {chr_filtered_file}
        ! wc -l {chr_filtered_file}

        print('Blacklist regions filtration')
        blacklist_filtered_file = os.path.join(td, 'blacklist_filtered.bed')
        ! bedtools intersect -v -a {chr_filtered_file} -b {blacklist_file} > {blacklist_filtered_file}
        ! wc -l {blacklist_filtered_file}
        # Cleanup
        ! rm {chr_filtered_file}

        print('Fragments and peaks intersection')
        intersection_file = os.path.join(td, 'intersection.bed')
        ! bedtools intersect -wa -wb -a {blacklist_filtered_file} -b {peaks_file} > {intersection_file}
        ! wc -l {intersection_file}
        # Cleanup
        ! rm {blacklist_filtered_file}

        idf = pd.read_csv(intersection_file, sep='\t', header=None)
        # Cleanup
        ! rm {intersection_file}        
        icolnames = ['chr', 'start', 'end', 'barcode', 'reads', 'peak_chr', 'peak_start', 'peak_end']
        assert len(idf.columns) >= len(icolnames)
        idf.rename(columns={o: n for (o, n) in zip(idf.columns[:len(icolnames)], icolnames)}, inplace=True)
        idf = idf[icolnames] # Reorder and drop others
        idf = idf.astype({'reads': 'int8'}) # Consume less memory
    print('Done intersecting of fragments and peaks')
    return idf
In [238]:
# Load or compute idf
idfpath = os.path.join(OUTPUT_DIR, 'idf.csv.gz')
if os.path.exists(idfpath):
    print(f'Loading IDF from {idfpath}')
    idf = pd.read_csv(idfpath, compression='gzip')
else:
    idf = intersect_fragments_and_peaks(fragments_file, PEAKS_FILE_10X, BLACKLIST_FILE)
    idf.to_csv(idfpath, index=False, compression='gzip')
    print(f'Saved IDF to {idfpath}')   
Loading IDF from /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/idf.csv.gz
In [242]:
def filter_cells(idf, noise_threshold=None, duplets_threshold=None):
    # 10x Genomics Cell Ranger ATAC-Seq marks duplicates
    # 'count' ignores multiple reads per barcode at same position
    pidf = pd.pivot_table(idf, values='reads', index=['barcode'], aggfunc='count')
    pidf.reset_index(level=0, inplace=True)
    
    print('Total barcodes', len(pidf))    
    plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
    
    plt.subplot(1, 3, 1)
    sns.distplot(np.log10(pidf['reads']))
    plt.title('UMI summary coverage distribution')
    plt.xlabel('Log10 coverage')
    plt.ylabel('Frequency')    

    # n is for shoulder graph
    counts = sorted(pidf['reads'], reverse=True)
    df_counts = pd.DataFrame(data={'count': counts, 'n': range(1, len(counts) + 1)})
    cells_filter = [True] * len(df_counts)

    plt.subplot(1, 3, 2)    
    if noise_threshold is not None:
        noise_filter = df_counts['count'] <= noise_threshold
        cells_filter = np.logical_and(cells_filter, np.logical_not(noise_filter))
        df_noise = df_counts.loc[noise_filter]
        print(f'Noise <= {noise_threshold}: {len(df_noise)}')
        plt.plot(np.log10(df_noise['n']), 
                 np.log10(df_noise['count']), 
                 label='Noise', linewidth=3, color='red')        

    if duplets_threshold is not None:
        duplets_filter = df_counts['count'] > duplets_threshold
        cells_filter = np.logical_and(cells_filter, np.logical_not(duplets_filter))
        df_duplets = df_counts.loc[duplets_filter]
        print(f'Duplets > {duplets_threshold}: {len(df_duplets)}')        
        plt.plot(np.log10(df_duplets['n']), 
                 np.log10(df_duplets['count']), 
                 label='Duplets', linewidth=3, color='orange')        

    df_cells = df_counts.loc[cells_filter]

    cells_filter = [True] * len(pidf)
    if noise_threshold is not None:
        cells_filter = np.logical_and(cells_filter, pidf['reads'] > noise_threshold)
    if duplets_threshold is not None:
        cells_filter = np.logical_and(cells_filter, pidf['reads'] < duplets_threshold)
        
    cells = set(pidf.loc[cells_filter]['barcode'])
    idfcells = idf.loc[[c in cells for c in idf['barcode']]]
    print('Estimated number of cells', len(cells))
    plt.title('Cells calling')
    plt.plot(np.log10(df_cells['n']), 
             np.log10(df_cells['count']), 
             label='Cell', linewidth=3, color='green')

    plt.xlabel('Log10 number of barcodes')
    plt.ylabel('Log10 fragments overlapping Peaks')
    plt.legend(loc='upper right')

    plt.subplot(1, 3, 3)
    sns.distplot(np.log10(df_cells['count']))
    plt.title('Cells UMI summary coverage distribution')
    plt.xlabel('Log10 coverage')
    plt.ylabel('Frequency')
    
    return idfcells
In [267]:
with PdfPages(os.path.join(figures_path, 'cell_calling.pdf')) as pdf:
    idfcells = filter_cells(idf, noise_threshold=400, duplets_threshold=40000)
    pdf.savefig()
Total barcodes 873234
Noise <= 400: 839418
Duplets > 40000: 287
Estimated number of cells 33529
In [268]:
# Cleanup memory
del idf
gc.collect()
Out[268]:
535638

Peak-Barcode Matrix

Pipeline Computes the number of distinct UMIs intersecting each peak; multiple copies of the same UMI are ignored.


Cell Ranger ATAC-Seq We produce a count matrix consisting of the counts of fragment ends (or cut sites) within each peak region for each barcode. This is the raw peak-barcode matrix and it captures the enrichment of open chromatin per barcode. The matrix is then filtered to consist of only cell barcodes, which is then used in subsequent analysis such as dimensionality reduction, clustering and visualization.

A barcoded fragment may get sequenced multiple times due to PCR amplification. We mark duplicates in order to identify the original fragments that constitute the library and contribute to its complexity.

In [269]:
def cellsdf_to_coverage(idfcells):
    print('Barcode vs summary fragments overlap with peaks')
    # 10x Genomics Cell Ranger ATAC-Seq marks duplicates
    # 'count' ignores multiple reads per barcode at same position
    cell_peak_cov_df = pd.pivot_table(idfcells, values='reads', 
                            index=['peak_chr', 'peak_start', 'peak_end', 'barcode'], 
                            aggfunc='count').reset_index()

    cell_peak_cov_df['peak'] = cell_peak_cov_df['peak_chr'] + ':' + \
        cell_peak_cov_df['peak_start'].astype(str) + '-' + cell_peak_cov_df['peak_end'].astype(str)
    cell_peak_cov_df.drop(columns=['peak_chr', 'peak_start', 'peak_end'], inplace=True)

    print('Transforming dataframe to peaks x barcode format')
    coveragedf = pd.pivot_table(cell_peak_cov_df, index='peak', columns='barcode', values='reads').fillna(0)
    # Remove extra labels from pivot_table columns
    coveragedf.columns = coveragedf.columns.values
    coveragedf.index.name = None
    return coveragedf
In [291]:
cov_mtx_path = os.path.join(OUTPUT_DIR, 'coverage.mtx')

if os.path.exists(cov_mtx_path):
    print(f'Loading coverage matrix {cov_mtx_path}')
    csc = io.mmread(cov_mtx_path)
    barcodes = [l.rstrip('\n') for l in open(os.path.join(OUTPUT_DIR, 'barcodes.txt'))]
    peaks = [l.rstrip('\n') for l in open(os.path.join(OUTPUT_DIR, 'peaks.txt'))]
    print(f'Loading into dataframe')
    coveragedf = pd.DataFrame(csc.toarray(), index=peaks, columns=barcodes)
    coveragedf.fillna(0, inplace=True)
    # Cleanup memory
    del csc
    del barcodes
    del peaks
    gc.collect()    
else:
    coveragedf = cellsdf_to_coverage(idfcells)
    print(f'Saving coverage matrix coverage.mtx')
    csc = sparse.csc_matrix(coveragedf.values, dtype=int)
    io.mmwrite(cov_mtx_path, csc)

    print('Saving barcodes.txt')
    with open(os.path.join(OUTPUT_DIR, 'barcodes.txt'), 'w') as f:
        for b in coveragedf.columns.values:
            f.write(b + '\n')

    print('Saving peaks.txt')
    with open(os.path.join(OUTPUT_DIR, 'peaks.txt'), 'w') as f:
        for p in coveragedf.index.values:
            f.write(p + '\n')    
    # Cleanup memory
    del csc
    gc.collect()    

print('DF', coveragedf.shape[0], 'peaks x', coveragedf.shape[1], 'cells')
Loading coverage matrix /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/coverage.mtx
Loading into dataframe
DF 123113 peaks x 31860 cells
In [271]:
# Cleanup memory
del idfcells
gc.collect()
Out[271]:
14
In [292]:
with PdfPages(os.path.join(figures_path, 'umis.pdf')) as pdf:
    print('Summary UMI distribution per factor')
    factors = [FACTOR_FUNCTION(bc) for bc in coveragedf.columns]
    for k, v in tqdm(FACTORS_MAP.items()):
        sns.kdeplot(np.log10(coveragedf[coveragedf.columns[np.flatnonzero(np.equal(factors, k))]].sum()), 
                    label=v)
    plt.title('UMI summary coverage distribution')
    plt.xlabel('Log10 coverage')
    plt.ylabel('Frequency')
Summary UMI distribution per factor

In [293]:
print('Cells per peak with non-zero coverage distribution')
cells_per_peak = np.count_nonzero(coveragedf, axis=1)
with PdfPages(os.path.join(figures_path, 'cells_per_peaks.pdf')) as pdf:
    sns.distplot(np.log10(cells_per_peak))
    plt.title('Cells with coverage > 0 per peak')
    plt.xlabel('Log10 cells')
    plt.ylabel('Frequency')
    pdf.savefig()
Cells per peak with non-zero coverage distribution
In [294]:
print('Peaks per cell with non-zero coverage distribution')
peaks_per_cell = np.count_nonzero(coveragedf, axis=0)
with PdfPages(os.path.join(figures_path, 'peaks_per_cell.pdf')) as pdf:
    sns.distplot(np.log10(peaks_per_cell))
    plt.title('Peaks with coverage > 0 per cell')
    plt.xlabel('Log10 peaks')
    plt.ylabel('Frequency')
    pdf.savefig()
Peaks per cell with non-zero coverage distribution

Normalization

Cell Ranger normalizes UMI counts to the median coverage depth per barcode - we do the same here.

Seurat we employ a global-scaling normalization method “LogNormalize” that normalizes the feature expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result.
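A minimal sketch of that LogNormalize transform on a (features x cells) count matrix (the function name is illustrative, not Seurat's API):

import numpy as np

def log_normalize(counts, scale_factor=10_000):
    """Seurat-style LogNormalize: per-cell total scaling, then log1p."""
    per_cell_total = counts.sum(axis=0)  # total expression of each cell
    return np.log1p(counts / per_cell_total * scale_factor)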

SnapATAC does not require population-level peak annotation, and instead assembles chromatin landscapes by directly clustering cells based on the similarity of their genome-wide accessibility profile. Using a regression-based normalization procedure, SnapATAC adjusts for differing read depth between cells.

In [295]:
print('UMI normalization by median fragments count per barcode') 
# (peaks x barcodes) format
size_factors = estimate_size_factors(coveragedf.values)
normdf = coveragedf / size_factors # per column, i.e. barcode
UMI normalization by median fragments count per barcode
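estimate_size_factors is imported from the local diffexp module and is not shown in this notebook; a hypothetical stand-in consistent with the median normalization described above might look like this (the real diffexp implementation may differ):

import numpy as np

def estimate_size_factors_sketch(mat):
    """Per-barcode size factors: column totals divided by their median."""
    totals = mat.sum(axis=0)  # (peaks x barcodes) -> summary depth per barcode
    return totals / np.median(totals)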

Feature selection (Identification of highly variable features)

Pipeline

  • peaks with mean coverage above the 99th percentile are removed as likely alignment errors or housekeeping genes
  • peaks with standard deviation below the 1st percentile are removed as noise

Cell Ranger ATAC removes features with zero variance during feature selection.

SnapATAC The vast majority of the items in the cell-by-bin count matrix are “0”, and some items have abnormally high coverage (often > 200), perhaps due to alignment errors. Bins of exceedingly high coverage, which likely represent genomic regions that are invariable between cells such as housekeeping gene promoters, were removed. We noticed that filtering bins of extremely low coverage, perhaps due to random noise, can also improve the robustness of the downstream clustering analysis. The steps were as follows (see the code sketch after this list):

  • Calculated the coverage of each bin using the binary matrix and
  • Removed the top 0.1% items of the highest coverage
  • Normalized the coverage by log10(count + 1).
  • Log-scaled coverage obey approximately a gaussian distribution which is then converted into zscore.
  • Bins with zscore beyond ±2 were filtered before further analysis.
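A minimal sketch of those filtering steps over a (bins x cells) binary matrix (all names are illustrative, not SnapATAC's code):

import numpy as np
from scipy.stats import zscore

def snapatac_bin_filter(binary_mat, top_fraction=0.001, z_cutoff=2.0):
    """Boolean mask of bins kept after the coverage filters described above."""
    coverage = binary_mat.sum(axis=1)  # coverage of each bin
    keep = coverage < np.quantile(coverage, 1 - top_fraction)  # drop top 0.1%
    zs = zscore(np.log10(coverage[keep] + 1))  # log10(count + 1), then z-score
    mask = np.zeros(len(coverage), dtype=bool)
    idx = np.flatnonzero(keep)
    mask[idx[np.abs(zs) <= z_cutoff]] = True  # keep bins with |z| <= 2
    return mask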

Signac The largely binary nature of scATAC-seq data makes it challenging to perform ‘variable’ feature selection, as we do for scRNA-seq. Instead, we can choose to use only the top n% of features (peaks) for dimensional reduction, or remove features present in fewer than n cells with the FindTopFeatures function. Here, we will use all features, though we note that we see very similar results when using only a subset of features (try setting min.cutoff to ‘q75’ to use the top 25% of all peaks), with faster runtimes. Features used for dimensional reduction are automatically set as VariableFeatures for the Seurat object by this function. See for explanation: https://satijalab.org/signac/articles/pbmc_vignette.html

Seurat calculates a subset of features that exhibit high cell-to-cell variation in the dataset (i.e., they are highly expressed in some cells and lowly expressed in others). We and others have found that focusing on these genes in downstream analysis helps to highlight biological signal in single-cell datasets.

In [298]:
def feature_selection(df, stdp=1, meanp=99):
    # df is (peaks x barcodes)
    print('Computing mean and std values for peaks...')
    means = np.mean(df.values, axis=1)
    stds = np.std(df.values, axis=1)
    peaksdf = pd.DataFrame({'mean': means, 'std': stds})
    print('Feature selection: filter out most highly covered peaks as alignment errors or housekeeping genes.')
    means_high = np.percentile(means, meanp)
    print('Feature selection: filter out non-variable peaks.')
    stds_low = np.percentile(stds, stdp)
    # Keep peaks below the high-mean cutoff and above the low-std cutoff
    peaks_filter = np.logical_and(stds_low < stds, means < means_high)
    return peaks_filter, peaksdf
In [299]:
# Feature selection on the UMI-normalized (peaks x barcodes) matrix
peaks_filter, peaksdf = feature_selection(normdf)

with PdfPages(os.path.join(figures_path, 'feature_selection_before.pdf')) as pdf:
    # df is (peaks x barcodes)
    sns.jointplot(x='mean', y='std', data=peaksdf)
    pdf.savefig()
    plt.show()
print('Total peaks', len(normdf))
Computing mean and std values for peaks...
Feature selection: filter out most highly covered peaks as alignment errors or housekeeping genes.
Feature selection: filter out non-variable peaks.
Total peaks 123113
In [300]:
with PdfPages(os.path.join(figures_path, 'feature_selection_after.pdf')) as pdf:
    sns.jointplot(x='mean', y='std', data=pd.DataFrame({'mean': peaksdf.loc[peaks_filter]['mean'], 
                                                        'std': peaksdf.loc[peaks_filter]['std']}))
    pdf.savefig()
    plt.show()
print('Filtered peaks', sum(peaks_filter))
Filtered peaks 120649

Check marker regions of interest

In [301]:
peaks = normdf.index[np.flatnonzero(peaks_filter)]
marker_regions_peaks = {}
for k, r in MARKERS_REGIONS.items():
    cr, sr, er = re.split(':|-', r)
    tp = [(c,s,e) for (c,s,e) in map(lambda p: re.split(':|-', p), peaks) 
        if c == cr and not (int(er) < int(s) or int(e) < int(sr))]
    if not tp:
        print(f'Not found {k}')
        continue
    if len(tp) != 1:
        print(f'Too many peaks for region {k}')
    cp, sp, ep = tp[0]
    marker_regions_peaks[k] = peak = f'{cp}:{sp}-{ep}'
    # Print after the peak is resolved so that region and peak stay in sync
    print('Region', k, 'Peak', peak)
Region Clec4a4 Peak chr6:122989632-122990644
Region Lysmd2 Peak chr9:75625037-75626606
Region Ube2e2 Peak chr14:18892997-18894686
Region H2-Eb2 Peak chr17:34324822-34327737
Region Nudt17 Peak chr3:96707280-96709496
Region Tnfrsf4 Peak chr4:156013571-156014106
Region Socs2 Peak chr10:95414191-95418810
Region Clec4a3 Peak chr6:122952067-122953232
Region Csf1r Peak chr18:61105311-61109447
Region Adgre4 Peak chr17:55749678-55750322
Region Cd300ld Peak chr11:114989480-114990440
Region CD24a Peak chr10:43577329-43581813
Region Xcr1 Peak chr9:123862947-123864098
Region Snx22 Peak chr9:66068727-66070382
Region Cxcl9 Peak chr5:92326968-92328785
Region Cd226 Peak chr18:89176536-89177367
Region Kcne3 Peak chr7:100178039-100179827
Region Clec10a Peak chr11:70165508-70167125
Region Selenop Peak chr15:3269991-3271953
Region Mki67 Peak chr7:135712307-135717421
Region Ube2c Peak chr2:164769037-164770680
Region Adgrg1 Peak chr8:94983231-94985652
Region Adgrg3 Peak chr8:95017083-95017967
Region Egfl7 Peak chr2:26578100-26578972

Dimensionality Reduction, Clustering and visualization

Pipeline Uses the Cell Ranger ATAC approach and the IRLB code for dimensionality reduction.

LSA (IDF + IRLBA + L2 normalization)

Cell Ranger ATAC default method is LSA.

  1. IDF scaling
  2. IRLBA SVD decomposition to dim 15
  3. L2 normalization in low dimension projection
  4. Graph clustering
  5. T-SNE visualization

IRLBA: The augmented implicitly restarted Lanczos bidiagonalization algorithm finds a few approximate largest singular values and corresponding singular vectors of a sparse or dense matrix using a method of Baglama and Reichel.

PCA: if your features are least sensitive (informative) towards the mean of the distribution, then it makes sense to subtract the mean. If the features are most sensitive towards high values, then subtracting the mean does not make sense.

SVD does not subtract the means but often as a first step projects the data on the mean of all data points. In this way the SVD first takes care of global structure - this is important for IDF transformation.
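The centering difference is easy to check numerically. A small sketch on synthetic data shows that on uncentered non-negative counts the first right singular vector essentially tracks the global mean ("depth") direction, which is exactly the component LSA pipelines worry about:

import numpy as np

np.random.seed(0)
X = np.random.poisson(5, size=(200, 30)).astype(float)  # non-negative count data

_, _, vt_raw = np.linalg.svd(X, full_matrices=False)
_, _, vt_centered = np.linalg.svd(X - X.mean(axis=0), full_matrices=False)

mean_dir = X.mean(axis=0) / np.linalg.norm(X.mean(axis=0))
print(abs(vt_raw[0] @ mean_dir))       # close to 1: first component is the depth direction
print(abs(vt_centered[0] @ mean_dir))  # much smaller once the mean is removed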

We found that the combination of these normalization techniques obviates the need to remove the first component. The number of dimensions is fixed to 15 as it was found to sufficiently separate clusters visually and in a biologically meaningful way when tested on peripheral blood mononuclear cells (PBMCs).

In [303]:
def lsa(df, dim):
    print('Normalizing by IDF')
    idf = np.log1p(df.shape[1]) - np.log1p(np.count_nonzero(df, axis=1))
    df_idf = df.multiply(idf, axis=0).T # Transpose to (barcodes x peaks) format
    del idf # Cleanup    
    
    print('Performing IRLBA without scaling or centering')
    # Number of dimensions recommended by Cell Ranger ATAC-Seq algorithm
    (_, _, v, _, _) = irlb(df_idf, dim)
    # project the matrix to complete the transform: X --> X*v = u*d
    df_idf_irlb = df_idf.dot(v) 
    assert df_idf_irlb.shape == (df_idf.shape[0], dim), 'check dimensions'
    del df_idf # Cleanup

    # IMPORTANT! We found that the combination of these normalization techniques 
    # obviates the need to remove the first component.
    print('Normalizing each barcode data point to unit L2-norm in the lower dimensional space')
    idf_irlb_l2 = Normalizer(norm='l2').fit_transform(df_idf_irlb) # (n_samples, n_features) 
    return pd.DataFrame(idf_irlb_l2, index=df_idf_irlb.index, columns=df_idf_irlb.columns)

# NO UMI normalization required for the LSA approach!
# IDF is invariant to scaling, and IRLBA doesn't need scaling.
result_lsa = lsa(normdf.loc[peaks_filter], 15)
Normalizing by IDF
Performing IRLBA without scaling or centering
Normalizing each barcode data point to unit L2-norm in the lower dimensional space
In [361]:
# Cleanup memory
del normdf
gc.collect()
# print_mem_usage()
Out[361]:
288138

TSNE visualization

In [304]:
print('Processing t-SNE on L2-normalized data')
tsne_coords = pd.DataFrame(TSNE(n_components=2).fit_transform(result_lsa.values),
                           index=result_lsa.index,
                           columns=['tsne1', 'tsne2'])
print('t-SNE done!')
# display(tsne_coords.head())

print('Saving tSNE coordinates to tsne.tsv')
tsne_coords.to_csv(os.path.join(OUTPUT_DIR, 'tsne.tsv'), sep='\t')
Processing t-SNE on L2-normalized data
t-SNE done!
Saving tSNE coordinates to tsne.tsv
In [305]:
print('tSNE on LSA normalized vs log10 coverage depth')
with PdfPages(os.path.join(figures_path, 'tsne.pdf')) as pdf:
    plot_colored(tsne_coords.values, np.log10(coveragedf.sum()), size=20)
    pdf.savefig()
tSNE on LSA normalized vs log10 coverage depth
In [306]:
print('tSNE on LSA normalized vs', FACTOR, FACTORS_MAP)
factors = np.array([FACTOR_FUNCTION(bc) for bc in result_lsa.index])
with PdfPages(os.path.join(figures_path, f'tsne_{FACTOR}.pdf')) as pdf:
    plot_colored(tsne_coords.values, factors, clusters=True, size=20, alpha=0.01)
    pdf.savefig()
tSNE on LSA normalized vs type {1: 'WT', 2: 'WT + IL-12', 3: 'BATF3 KO', 4: 'BATF3 KO + IL-12'}
In [307]:
print(FACTORS_MAP)
with PdfPages(os.path.join(figures_path, f'tsne_{FACTOR}_split.pdf')) as pdf:
    plot_colored_split(tsne_coords.values, factors, alpha=0.1)
    pdf.savefig()
{1: 'WT', 2: 'WT + IL-12', 3: 'BATF3 KO', 4: 'BATF3 KO + IL-12'}

Graph clustering of LSA normalized data

Clustering should:

  • produce enough clusters: 10-15 are recommended by Cell Ranger
  • not produce too-small clusters (< 100 cells), which are likely artifacts
  • place cluster centers inside their clusters; otherwise we should increase the number of clusters OR the size of the low-dimensional space (a quick check is sketched below)
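A quick sanity check for the last point, assuming coords is an (n x 2) embedding like tsne_coords.values and clusters is the label array (the helper name is ours):

import numpy as np

def centers_inside_clusters(coords, clusters):
    """For each cluster, check that the cell nearest to its centroid belongs to it."""
    ok = {}
    for c in set(clusters):
        centroid = coords[clusters == c].mean(axis=0)
        nearest = np.argmin(np.linalg.norm(coords - centroid, axis=1))
        ok[c] = clusters[nearest] == c
    return ok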
In [308]:
def graph_clustering(result, n_clusters):
    # Ward method as default
    print('Clustering', n_clusters)
    clusters = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(result).astype(int)
    cluster_counter = Counter()
    for c in clusters:
        cluster_counter[c] += 1

    # reorder clusters by size descending
    clusters_reord = np.zeros(len(clusters), dtype=int)    
    for i, (c, n) in enumerate(cluster_counter.most_common()):
        clusters_reord[clusters == c] = i

    return clusters_reord

# 20 clusters is the default in Cell Ranger ATAC-Seq; we use 24 here
n_clusters = 24
clusters = graph_clustering(result_lsa, n_clusters)
print(f'Saving clusters to clusters{n_clusters}.tsv')
clusters_df = pd.DataFrame({'cluster': clusters}, index=result_lsa.index)
clusters_df.to_csv(os.path.join(OUTPUT_DIR, f'clusters{n_clusters}.tsv'), sep='\t')
Clustering 24
Saving clusters to clusters24.tsv
In [309]:
print('QC clustermap of normalized LSA coordinates and clusters')
n = 100
print('Sampling', n, 'cells from each cluster')
ts = []
for cluster in tqdm(range(n_clusters)):
    t = result_lsa.loc[clusters == cluster].sample(n=n, axis=0, replace=True).copy()
    t['cluster'] = cluster / n_clusters # Normalize cluster number to 0-1
    ts.append(t)

with PdfPages(os.path.join(figures_path, 'cluster_lsa.pdf')) as pdf:
    sns.clustermap(pd.concat(ts), col_cluster=False, row_cluster=False, figsize=(8, 8))
    pdf.savefig()
QC clustermap of normalized LSA coordinates and clusters
Sampling 100 cells from each cluster

In [310]:
print('Computing summary for clusters by', FACTOR)
print(FACTORS_MAP)

t = pd.DataFrame({FACTOR: [FACTOR_FUNCTION(bc) for bc in coveragedf.columns],
                 'cluster': clusters, 'counts': np.ones(len(coveragedf.columns))}, 
                 index=coveragedf.columns)
display(t[[FACTOR, 'counts']].groupby([FACTOR]).sum())

display(t[['cluster', 'counts']].groupby(['cluster']).sum())

with PdfPages(os.path.join(figures_path, 'clusters_sizes.pdf')) as pdf:
    clusters_sizes(t, 'counts')
    pdf.savefig()

with PdfPages(os.path.join(figures_path, f'clusters_{FACTOR}.pdf')) as pdf:
    clusters_factors(t, 'counts', FACTORS_MAP)
    pdf.savefig() 
Computing summary for clusters by type
{1: 'WT', 2: 'WT + IL-12', 3: 'BATF3 KO', 4: 'BATF3 KO + IL-12'}
         counts
type
1        8912.0
2        9957.0
3        5350.0
4        7641.0

         counts
cluster
0        5825.0
1        2904.0
2        2574.0
3        2338.0
4        2246.0
5        1986.0
6        1863.0
7        1341.0
8        1317.0
9        1002.0
10        991.0
11        965.0
12        941.0
13        675.0
14        675.0
15        673.0
16        655.0
17        601.0
18        526.0
19        421.0
20        371.0
21        344.0
22        343.0
23        283.0
In [311]:
with PdfPages(os.path.join(figures_path, 'clusters_umi.pdf')) as pdf:
    ts = []
    plt.figure(figsize=(int(n_clusters * 0.75), 6))
    for c in tqdm(set(clusters)):
        t = pd.DataFrame({'umi': np.log10(coveragedf.T.iloc[np.flatnonzero(clusters == c)].sum(axis=1))})
        t['cluster'] = str(c)
        ts.append(t)
    sns.violinplot(data=pd.concat(ts), x='cluster', y='umi', order=[str(c) for c in sorted(set(clusters))])
    plt.title('UMI summary coverage per cell')
    plt.xlabel('Cluster')
    plt.ylabel('Summary log10 coverage')
    pdf.savefig()
In [312]:
with PdfPages(os.path.join(figures_path, 'clusters_cell_peaks.pdf')) as pdf:
    ts = []
    plt.figure(figsize=(int(n_clusters * 0.75), 6))    
    for c in tqdm(set(clusters)):
        t = pd.DataFrame({
            'nzpeaks': np.log10(np.count_nonzero(coveragedf.T.iloc[np.flatnonzero(clusters == c)], axis=1))
        })
        t['cluster'] = str(c)
        ts.append(t)
    sns.violinplot(data=pd.concat(ts), x='cluster', y='nzpeaks', order=[str(c) for c in sorted(set(clusters))])
    plt.title('Peaks with non-zero coverage distribution per cell')
    plt.xlabel('Cluster')
    plt.ylabel('Log10 peaks')
    pdf.savefig()

Clusters visualization

In [313]:
print('tSNE on LSA normalized vs graph clusters')
with PdfPages(os.path.join(figures_path, 'tsne_clusters.pdf')) as pdf:
    plot_colored(tsne_coords.values, clusters, clusters=True, show_centers=True, size=20, alpha=0.5)
    pdf.savefig()
tSNE on LSA normalized vs graph clusters
In [314]:
with PdfPages(os.path.join(figures_path, 'tsne_clusters_split.pdf')) as pdf:
    plot_colored_split(tsne_coords.values, clusters, size=5)
    pdf.savefig()

Clusters visualization vs 10x Cell Ranger ATAC

In [315]:
# print('These clusters into Cell Ranger ATAC coordinates') 
mergeddf1 = pd.merge(pd.DataFrame({'Barcode': tsne_coords.index, 'Cluster': clusters}),
                     right=tsne10xdf, 
                     left_on='Barcode', right_on='Barcode')

mergeddf2 = pd.merge(pd.DataFrame({'Barcode': tsne_coords.index,
                                  'tsne1': tsne_coords['tsne1'],
                                  'tsne2': tsne_coords['tsne2']}),
                     right=clusters10xdf, 
                     left_on='Barcode', right_on='Barcode')


fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 7))

ax1.title.set_text('Clusters in 10x tsne coordinates')
plot_colored(mergeddf1[['TSNE-1', 'TSNE-2']].values, 
             mergeddf1['Cluster'], 
             clusters=True,
             show_centers=True,
             size=20, alpha=0.3, fig_ax=(fig, ax1))

ax2.title.set_text('10x clusters in tsne coordinates')
plot_colored(mergeddf2[['tsne1', 'tsne2']].values, 
             mergeddf2['Cluster'], 
             clusters=True,
             show_centers=True,
             size=20, alpha=0.3, fig_ax=(fig, ax2))
plt.show()

UMAP visualization

In [316]:
# UMAP visualization
umap_coords = pd.DataFrame(umap.UMAP().fit_transform(result_lsa.values)[:, :2], 
                           index=result_lsa.index,
                           columns=['umap1', 'umap2'])

print('Saving UMAP coordinates to umap.tsv')
umap_coords.to_csv(os.path.join(OUTPUT_DIR, 'umap.tsv'), sep='\t')
/home/user/anaconda/envs/py37/lib/python3.6/site-packages/numba/typed_passes.py:271: NumbaPerformanceWarning: 
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../../../home/user/anaconda/envs/py37/lib/python3.6/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  state.func_ir.loc))
Saving UMAP coordinates to umap.tsv
In [317]:
print('UMAP on LSA normalized vs log10 coverage depth')
with PdfPages(os.path.join(figures_path, 'umap.pdf')) as pdf:
    plot_colored(umap_coords.values, np.log10(coveragedf.sum()), size=20)
    pdf.savefig()
UMAP on LSA normalized vs log10 coverage depth
In [318]:
print('UMAP on LSA normalized vs', FACTOR, FACTORS_MAP)
with PdfPages(os.path.join(figures_path, f'umap_{FACTOR}.pdf')) as pdf:
    factors = [FACTOR_FUNCTION(bc) for bc in result_lsa.index]
    plot_colored(umap_coords.values, factors, clusters=True, size=20, alpha=0.01)
    pdf.savefig()
UMAP on LSA normalized vs type {1: 'WT', 2: 'WT + IL-12', 3: 'BATF3 KO', 4: 'BATF3 KO + IL-12'}
In [319]:
print(FACTORS_MAP)
with PdfPages(os.path.join(figures_path, f'umap_{FACTOR}_split.pdf')) as pdf:
    plot_colored_split(umap_coords.values, factors, size=5, alpha=0.1)
    pdf.savefig()
{1: 'WT', 2: 'WT + IL-12', 3: 'BATF3 KO', 4: 'BATF3 KO + IL-12'}

In [320]:
print('UMAP on LSA normalized vs graph clusters')
with PdfPages(os.path.join(figures_path, 'umap_clusters.pdf')) as pdf:
    plot_colored(umap_coords.values, clusters, clusters=True, show_centers=True, size=20, alpha=0.3)
    pdf.savefig()
UMAP on LSA normalized vs graph clusters
In [321]:
print('UMAP on LSA normalized vs graph clusters')
with PdfPages(os.path.join(figures_path, 'umap_clusters_split.pdf')) as pdf:
    plot_colored_split(umap_coords.values, clusters, size=5)
    pdf.savefig()
UMAP on LSA normalized vs graph clusters

Analysis of markers of interest

In [322]:
def regions_plot(regions, df, xs, ys, xlabel, ylabel, factors, clusters, binary):
    print(f'Coverage z-score, distribution and fraction of non-zero covered peaks w.r.t {FACTOR} and clusters')
    
    fig = plt.figure(figsize=(4*4, 4*len(regions)))
    grid = plt.GridSpec(nrows=len(regions), ncols=8, wspace=0.5, hspace=0.5, figure=fig)
    for i, (k, peak) in enumerate(regions.items()):
        vals = df.loc[peak]
        print(k, peak, 'cov>0:', sum(vals > 0), 'of', len(vals), 'max cov:', max(vals))

        ax = plt.subplot(grid[i, 0:2])
        # Plot zscores scatter plot with 2 segments colormap blue-white-red
        zs = zscore(vals)
        cmap = LinearSegmentedColormap.from_list(name='zscore', 
            colors=[(0, 'darkblue'), ((0 - np.min(zs)) / (np.max(zs) - np.min(zs)), 'white'), (1, 'red')])
        plt.setp(ax, xticks=[], yticks=[])
        ax.scatter(xs, ys, marker='.', c=zs, cmap=cmap, s=2, alpha=0.2)
        plt.title(k)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)        
        
        t = pd.DataFrame({'cluster': clusters, 'factor': factors, 
                          'val': [min(v, 1) for v in vals] if binary else vals})
        
        
        ax = plt.subplot(grid[i, 2])
        # mean non-zero covered peaks by type
        tf = pd.pivot_table(t, values='val', index=['factor'], aggfunc='mean').reset_index()
        sns.barplot(x='factor', y='val', data=tf)
        ax.tick_params(axis='x', labelrotation=90)
        plt.ylabel('')

        ax = plt.subplot(grid[i, 3:])
        # mean non-zero covered peaks by cluster and factor
        tc = t.groupby(['cluster', 'factor']).mean().reset_index()
        sns.barplot(x='cluster', y='val', data=tc, hue='factor')
        plt.ylabel('')
In [323]:
print('Analysing REGIONS of interest TSNE')
with PdfPages(os.path.join(figures_path, 'tsne_markers.pdf')) as pdf:
    regions_plot(marker_regions_peaks, coveragedf, 
                 tsne_coords['tsne1'], tsne_coords['tsne2'],
                 'tsne1', 'tsne2',
                [FACTORS_MAP[f] for f in factors], clusters, True)
    pdf.savefig()
Analysing REGIONS of interest TSNE
Coverage z-score, distribution and fraction of non-zero covered peaks w.r.t type and clusters
Clec4a4 chr6:122989632-122990644 cov>0: 1329 of 31860 max cov: 3
Lysmd2 chr9:75625037-75626606 cov>0: 1309 of 31860 max cov: 5
Ube2e2 chr14:18892997-18894686 cov>0: 1830 of 31860 max cov: 5
H2-Eb2 chr17:34324822-34327737 cov>0: 2742 of 31860 max cov: 7
Nudt17 chr3:96707280-96709496 cov>0: 1966 of 31860 max cov: 5
Tnfrsf4 chr4:156013571-156014106 cov>0: 248 of 31860 max cov: 4
Socs2 chr10:95414191-95418810 cov>0: 4453 of 31860 max cov: 7
Clec4a3 chr6:122952067-122953232 cov>0: 2031 of 31860 max cov: 4
Csf1r chr18:61105311-61109447 cov>0: 5478 of 31860 max cov: 9
Adgre4 chr17:55749678-55750322 cov>0: 748 of 31860 max cov: 3
Cd300ld chr11:114989480-114990440 cov>0: 1132 of 31860 max cov: 5
CD24a chr10:43577329-43581813 cov>0: 3891 of 31860 max cov: 10
Xcr1 chr9:123862947-123864098 cov>0: 1386 of 31860 max cov: 3
Snx22 chr9:66068727-66070382 cov>0: 1831 of 31860 max cov: 5
Cxcl9 chr5:92326968-92328785 cov>0: 1002 of 31860 max cov: 5
Cd226 chr18:89176536-89177367 cov>0: 669 of 31860 max cov: 3
Kcne3 chr7:100178039-100179827 cov>0: 1490 of 31860 max cov: 5
Clec10a chr11:70165508-70167125 cov>0: 1975 of 31860 max cov: 5
Selenop chr15:3269991-3271953 cov>0: 2394 of 31860 max cov: 4
Mki67 chr7:135712307-135717421 cov>0: 5873 of 31860 max cov: 7
Ube2c chr2:164769037-164770680 cov>0: 5966 of 31860 max cov: 5
Adgrg1 chr8:94983231-94985652 cov>0: 1643 of 31860 max cov: 5
Adgrg3 chr8:95017083-95017967 cov>0: 485 of 31860 max cov: 4
Egfl7 chr2:26578100-26578972 cov>0: 545 of 31860 max cov: 3
In [324]:
print('Analysing REGIONS of interest UMAP')
with PdfPages(os.path.join(figures_path, 'umap_markers.pdf')) as pdf:
    regions_plot(marker_regions_peaks, coveragedf, 
                 umap_coords['umap1'], umap_coords['umap2'],
                 'umap1', 'umap2',
                 [FACTORS_MAP[f] for f in factors], clusters, True)
    pdf.savefig()
Analysing REGIONS of interest UMAP
Coverage z-score, distribution and fraction of non-zero covered peaks w.r.t type and clusters
Clec4a4 chr6:122989632-122990644 cov>0: 1329 of 31860 max cov: 3
Lysmd2 chr9:75625037-75626606 cov>0: 1309 of 31860 max cov: 5
Ube2e2 chr14:18892997-18894686 cov>0: 1830 of 31860 max cov: 5
H2-Eb2 chr17:34324822-34327737 cov>0: 2742 of 31860 max cov: 7
Nudt17 chr3:96707280-96709496 cov>0: 1966 of 31860 max cov: 5
Tnfrsf4 chr4:156013571-156014106 cov>0: 248 of 31860 max cov: 4
Socs2 chr10:95414191-95418810 cov>0: 4453 of 31860 max cov: 7
Clec4a3 chr6:122952067-122953232 cov>0: 2031 of 31860 max cov: 4
Csf1r chr18:61105311-61109447 cov>0: 5478 of 31860 max cov: 9
Adgre4 chr17:55749678-55750322 cov>0: 748 of 31860 max cov: 3
Cd300ld chr11:114989480-114990440 cov>0: 1132 of 31860 max cov: 5
CD24a chr10:43577329-43581813 cov>0: 3891 of 31860 max cov: 10
Xcr1 chr9:123862947-123864098 cov>0: 1386 of 31860 max cov: 3
Snx22 chr9:66068727-66070382 cov>0: 1831 of 31860 max cov: 5
Cxcl9 chr5:92326968-92328785 cov>0: 1002 of 31860 max cov: 5
Cd226 chr18:89176536-89177367 cov>0: 669 of 31860 max cov: 3
Kcne3 chr7:100178039-100179827 cov>0: 1490 of 31860 max cov: 5
Clec10a chr11:70165508-70167125 cov>0: 1975 of 31860 max cov: 5
Selenop chr15:3269991-3271953 cov>0: 2394 of 31860 max cov: 4
Mki67 chr7:135712307-135717421 cov>0: 5873 of 31860 max cov: 7
Ube2c chr2:164769037-164770680 cov>0: 5966 of 31860 max cov: 5
Adgrg1 chr8:94983231-94985652 cov>0: 1643 of 31860 max cov: 5
Adgrg3 chr8:95017083-95017967 cov>0: 485 of 31860 max cov: 4
Egfl7 chr2:26578100-26578972 cov>0: 545 of 31860 max cov: 3

BigWig RPM normalized profiles

Pipeline Splits fragments by cluster and applies RPM normalization.


Seurat We created pseudo-bulk ATAC-seq profiles by pooling together cells for each cell type. Each cell type showed enriched accessibility near canonical marker genes. Chromatin accessibility tracks are normalized to sequencing depth (RPKM normalization) in each pooled group.

In [330]:
def write_bigwigs(barcodes, clusters, bigwigs_path, step):
    print('Loading full fragments dataframe')
    fdf = pd.read_csv(fragments_file, sep='\t', names=['chr', 'start', 'end', 'barcode', 'reads'])

    print('Aggregating fragments and clusters')
    t = pd.DataFrame({'barcode': barcodes, 'cluster': clusters})
    df_for_bigwigs = pd.merge(left=fdf, right=t,
                              left_on='barcode', right_on='barcode')

    for cluster in sorted(set(clusters)):
        bw_path = os.path.join(bigwigs_path, f'cluster_{cluster}.bw')
        print('Processing', cluster, bw_path)
        df_for_bigwigs_cluster = df_for_bigwigs.loc[df_for_bigwigs['cluster'] == cluster]
        chr_lengths = df_for_bigwigs_cluster[['chr', 'end']].groupby('chr').max().reset_index()
        # Every fragment contributes 1.0 in total, so the track total equals the fragment count
        rpm_scale = 1_000_000.0 / len(df_for_bigwigs_cluster)

        with pyBigWig.open(bw_path, 'w') as bw:
            bw.addHeader(list(zip(chr_lengths['chr'], chr_lengths['end'])))
            for chromosome in tqdm(chr_lengths['chr']):
                df_for_bigwigs_cluster_chr =\
                    df_for_bigwigs_cluster.loc[df_for_bigwigs_cluster['chr'] == chromosome].sort_values(['start', 'end'])

                starts = list(df_for_bigwigs_cluster_chr['start'])
                ends = list(df_for_bigwigs_cluster_chr['end'])
                reads = list(df_for_bigwigs_cluster_chr['reads'])
                chr_length = int(chr_lengths.loc[chr_lengths['chr'] == chromosome]['end'])
                values = np.zeros(int(math.floor(chr_length / step)) + 1)

                for i in range(len(df_for_bigwigs_cluster_chr)):
                    # Ignore PCR duplicates here!
                    # v = reads[i] 
                    v = 1.0
                    si = int(math.floor(starts[i] / step)) 
                    ei = int(math.floor(ends[i] / step)) 
                    if ei == si:
                        values[si] += v
                    else:
                        values[si] += v / 2 
                        values[ei] += v / 2

                non_zero = values > 0
                non_zero_inds = np.flatnonzero(non_zero)
                starts_np = non_zero_inds * step
                ends_np = (non_zero_inds + 1) * step
                values_np = values[non_zero]
                # RPM normalization over the whole cluster track
                values_np = values_np * rpm_scale
                chroms_np = np.array([chromosome] * sum(non_zero))
                bw.addEntries(chroms_np, starts=starts_np, ends=ends_np, values=values_np)                     

    print('Done')
In [331]:
step = 200
bigwigs_path = os.path.join(OUTPUT_DIR, 'bigwig')
! mkdir -p {bigwigs_path}

write_bigwigs(clusters_df.index, clusters, bigwigs_path, step)
Loading full fragments dataframe
Aggregating fragments and clusters
Processing 0 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_0.bw
Processing 1 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_1.bw
Processing 2 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_2.bw
Processing 3 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_3.bw
Processing 4 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_4.bw
Processing 5 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_5.bw
Processing 6 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_6.bw
Processing 7 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_7.bw
Processing 8 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_8.bw
Processing 9 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_9.bw
Processing 10 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_10.bw
Processing 11 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_11.bw
Processing 12 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_12.bw
Processing 13 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_13.bw
Processing 14 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_14.bw
Processing 15 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_15.bw
Processing 16 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_16.bw
Processing 17 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_17.bw
Processing 18 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_18.bw
Processing 19 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_19.bw
Processing 20 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_20.bw
Processing 21 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_21.bw
Processing 22 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_22.bw
Processing 23 /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/bigwig/cluster_23.bw
Done

Clusters full datasheet

Closest genes for peaks annotation

In [333]:
print('Preprocess GTF file')

gtf_file = GTF_FILE
transcripts_bed = gtf_file + '.bed'
if not os.path.exists(transcripts_bed):
    GTF_TO_BED_SH = """
TNF=$(cat $1 | grep transcript_id | grep -v '#' | head -n 1 | awk '{for (i=4; i<NF; i++) {if ($3=="transcript" && $i=="gene_name") print (i+1)}}')
cat $1 | awk -v TNF=$TNF -v OFS="\t" '{if ($3=="transcript") {print $1,$4-1,$5,$TNF,0,$7}}' | tr -d '";' | sort -k1,1 -k2,2n -k3,3n --unique > $2
    """
    with tempfile.NamedTemporaryFile(prefix='gtf_to_bed', suffix='.sh', delete=False) as f:
        f.write(GTF_TO_BED_SH.encode('utf-8'))
        f.close()
        ! bash {f.name} {gtf_file} {transcripts_bed} 
print('Transcripts bed file', transcripts_bed)

transcripts_tss = gtf_file + '.tss'
if not os.path.exists(transcripts_tss):
    BED_TO_TSS_SH = """
cat $1 | awk -v OFS="\t" '{if ($6=="+") {print $1, $2, $2+1, $4, 0, "+"} else {print $1,$3-1,$3, $4, 0, "-"}}' | sort -k1,1 -k2,2n -k3,3n --unique > $2
    """
    with tempfile.NamedTemporaryFile(prefix='bed_to_tss', suffix='.sh', delete=False) as f:
        f.write(BED_TO_TSS_SH.encode('utf-8'))
        f.close()
        ! bash {f.name} {transcripts_bed} {transcripts_tss} 
print('Transcripts tss file', transcripts_tss)
Preprocess GTF file
Transcripts bed file /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/gencode.vM25.annotation.gtf.bed
Transcripts tss file /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/gencode.vM25.annotation.gtf.tss

Pipeline: Preprocess genes to TSS positions and use bedtools closest -D b to report the distance to the TSS, upstream or downstream w.r.t. the gene strand.


Cell Ranger ATAC: We use bedtools closest -D b to associate each peak with genes based on the closest transcription start sites (packaged within the reference), such that the peak is within 1000 bases upstream or 100 bases downstream of the TSS.
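A minimal sketch of this annotation step with pybedtools (imported above); the coordinates and gene name below are hypothetical:

# Hypothetical peak and TSS to illustrate `bedtools closest -D b`
peak_bt = BedTool('chr1\t100\t200\n', from_string=True)
tss_bt = BedTool('chr1\t500\t501\tGeneA\t0\t+\n', from_string=True)
# D='b' maps to the -D b flag: the extra column is the signed distance
# to the TSS, with the sign taken w.r.t. the strand of the gene (file B)
print(peak_bt.closest(tss_bt, D='b'))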

In [334]:
def locations_closest_genes(locations, transcripts_tss):
    print('Annotate list of locations in format "chr:start-end" with closest tss of genes')
    with tempfile.NamedTemporaryFile(delete=False) as f:
        # Save index to restore original order after sorting
        for i, t in enumerate([re.split(':|-', p) for p in locations]):
            f.write('{}\t{}\t{}\t{}\n'.format(*t, i).encode('utf-8'))
        f.close()
        closest_file = f.name + '.closest'
        sorted_file = f.name + '.sorted'
        ! sort -k1,1 -k2,2n {f.name} > {sorted_file}
        ! bedtools closest -a {sorted_file} -b {transcripts_tss} -D b > {closest_file}
        closest_df = pd.read_csv(
            closest_file, sep='\t', 
            names=['chr', 'start', 'end', 'index', 
                   'gene_chr', 'gene_start', 'gene_end', 'gene', 'gene_score', 'gene_strand', 'distance'])
        # Restore original order using the saved index
        closest_df.sort_values(by=['index'], inplace=True)
        # Pick only first closest gene in case of several
        closest_df.drop_duplicates(['index'], inplace=True)
        return closest_df[['chr', 'start', 'end', 'gene', 'distance']]


closest_genes = locations_closest_genes(list(coveragedf.index), transcripts_tss)
# Restore 'peak' to be able to merge on it
closest_genes['peak'] = closest_genes['chr'] +\
                        ':' + closest_genes['start'].astype(str) +\
                        '-' + closest_genes['end'].astype(str)
display(closest_genes.head())
Annotate list of locations in format "chr:start-end" with closest tss of genes
chr start end gene distance peak
17772 chr10 100015555 100016699 Kitl 0 chr10:100015555-100016699
17777 chr10 100138583 100138921 Gm22918 -580 chr10:100138583-100138921
17778 chr10 100148182 100148457 Gm25287 3929 chr10:100148182-100148457
17779 chr10 100160010 100160862 Gm25287 15757 chr10:100160010-100160862
17780 chr10 100171126 100171134 Gm25287 26873 chr10:100171126-100171134

Save cluster mean values

In [335]:
# Transpose to (barcodes x peaks) format
t = coveragedf.T 

# Per cluster mean values for peaks
clusters_means = {str(c): t.loc[clusters == c].mean() for c in set(clusters)}
clusters_means_df = pd.DataFrame(clusters_means).reset_index()[[str(c) for c in set(clusters)]]
clusters_means_genes_df = pd.concat([closest_genes.reset_index(), clusters_means_df], axis=1).drop(columns=['index'])
# Save peak without dashes or colons
clusters_means_genes_df['peak'] = [re.sub('[:-]', '_', p) for p in clusters_means_genes_df['peak']]

print('Saving clusters peaks_means to clusters_peaks_values.tsv')
display(clusters_means_genes_df.head())
clusters_means_genes_df.to_csv(os.path.join(OUTPUT_DIR, 'clusters_peaks_values.tsv'), sep='\t', index=False)
Saving clusters peaks_means to clusters_peaks_values.tsv
chr start end gene distance peak 0 1 2 3 ... 14 15 16 17 18 19 20 21 22 23
0 chr10 100015555 100016699 Kitl 0 chr10_100015555_100016699 0.004074 0.008867 0.003239 0.006009 ... 0.002300 0.010976 0.023020 0.006335 0.036935 0.031265 0.046463 0.01628 0.014317 0.080084
1 chr10 100138583 100138921 Gm22918 -580 chr10_100138583_100138921 0.001092 0.007065 0.000000 0.006254 ... 0.000883 0.001715 0.018205 0.003715 0.001147 0.001262 0.005368 0.00000 0.000000 0.004664
2 chr10 100148182 100148457 Gm25287 3929 chr10_100148182_100148457 0.001751 0.005416 0.000336 0.004071 ... 0.000710 0.001510 0.002136 0.001087 0.025741 0.006323 0.000000 0.00000 0.000000 0.001157
3 chr10 100160010 100160862 Gm25287 15757 chr10_100160010_100160862 0.013622 0.023060 0.011224 0.014149 ... 0.001029 0.006745 0.018364 0.006480 0.013917 0.004136 0.024967 0.00000 0.004935 0.039383
4 chr10 100171126 100171134 Gm25287 26873 chr10_100171126_100171134 0.000249 0.001126 0.000000 0.000521 ... 0.000000 0.000534 0.000000 0.000930 0.001733 0.000906 0.006751 0.00000 0.000000 0.000000

5 rows × 30 columns
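The per-cluster means above can also be computed with a single groupby; a sketch, assuming clusters is ordered like the columns of coveragedf:

# Equivalent per-cluster peak means via groupby (a sketch; assumes `clusters`
# is aligned with coveragedf.columns)
clusters_means_alt = coveragedf.T.groupby(np.asarray(clusters)).mean().T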

In [336]:
print('Hierarchical clustering of clusters mean values')
with PdfPages(os.path.join(figures_path, 'clusters_hclust.pdf')) as pdf:
    sns.clustermap(clusters_means_genes_df[[str(i) for i in set(clusters)]], 
                   col_cluster=True, row_cluster=False, figsize=(10, 5), cmap=plt.cm.viridis)
    pdf.savefig()
Hierarchical clustering of clusters mean values

Differential peaks analysis (Markers)

Pipeline

Use the Cell Ranger ATAC testing procedure to compute differential markers.


Cell Ranger ATAC: For each peak we test the cluster vs. all other cells; since several clusters may yield significant results against the others, we take into account only the cluster with the greatest mean value. The test is performed using Cell Ranger's default testing procedure based on a Negative Binomial GLM.

In [339]:
# Launch 10x Genomics Cell Ranger DE
def run_10x_differential_expression(df, clusters, fdr=0.05):
    """ Compute differential expression for each cluster vs all other cells
        Args: df          - feature expression data (peak x barcode)
              clusters    - 0-based cluster labels
              fdr         - false discovery rate"""

    print("Computing params...")
    m = df.values
    sseq_params = compute_sseq_params(m)
    peaks = list(df.index)    

    cluster_markers = []
    for cluster in tqdm(sorted(set(clusters))):
        in_cluster = clusters == cluster
        group_a = np.flatnonzero(in_cluster)
        group_b = np.flatnonzero(np.logical_not(in_cluster))
        print(f'Computing DE for cluster {cluster}...')

        de_result = sseq_differential_expression(m, group_a, group_b, sseq_params)
        de_result['cluster'] = [cluster] * len(de_result)
        de_result['peak'] = peaks
        passed_fdr = de_result['adjusted_p_value'] < fdr
        passed_log2_fc = de_result['log2_fold_change'] > 0
        # Keep peaks that pass the FDR threshold and have log2 fold change > 0
        passed = np.logical_and(passed_fdr, passed_log2_fc)
        print('DE:', sum(passed), 'of', len(de_result), 
              'FDR:', sum(passed_fdr), 'log2fc>0:', sum(passed_log2_fc))

        passed_de = de_result.loc[passed]
        cluster_de_markers = passed_de[
            ['cluster', 'peak', 'norm_mean_a', 'norm_mean_b', 'log2_fold_change', 'p_value', 'adjusted_p_value']
        ].rename(columns={'norm_mean_a': 'norm_mean_cluster', 'norm_mean_b': 'norm_mean_others'}) 

        cluster_markers.append(cluster_de_markers)    

    return pd.concat(cluster_markers)
In [340]:
# Cell Ranger ATAC-like differential analysis
# Each peak is tested independently, no normalization to peak length required.
markers = run_10x_differential_expression(coveragedf.loc[peaks_filter], clusters)
print('Total markers', len(markers))
Computing params...
Computing DE for cluster 0...
Computing 114536 exact tests and 6113 asymptotic tests.
DE: 28446 of 120649 FDR: 80620 log2fc>0: 46075
Computing DE for cluster 1...
Computing 119724 exact tests and 925 asymptotic tests.
DE: 25383 of 120649 FDR: 55643 log2fc>0: 56043
Computing DE for cluster 2...
Computing 120528 exact tests and 121 asymptotic tests.
DE: 15168 of 120649 FDR: 59370 log2fc>0: 36828
Computing DE for cluster 3...
Computing 120242 exact tests and 407 asymptotic tests.
DE: 28932 of 120649 FDR: 70916 log2fc>0: 49670
Computing DE for cluster 4...
Computing 120468 exact tests and 181 asymptotic tests.
DE: 24212 of 120649 FDR: 79489 log2fc>0: 41983
Computing DE for cluster 5...
Computing 120648 exact tests and 1 asymptotic tests.
DE: 18827 of 120649 FDR: 38535 log2fc>0: 53282
Computing DE for cluster 6...
Computing 120613 exact tests and 36 asymptotic tests.
DE: 16667 of 120649 FDR: 49854 log2fc>0: 45436
Computing DE for cluster 7...
Computing 120649 exact tests and 0 asymptotic tests.
DE: 15701 of 120649 FDR: 45716 log2fc>0: 39251
Computing DE for cluster 8...
Computing 119995 exact tests and 654 asymptotic tests.
DE: 16704 of 120649 FDR: 72707 log2fc>0: 31261
Computing DE for cluster 9...
Computing 120649 exact tests and 0 asymptotic tests.
DE: 20453 of 120649 FDR: 35206 log2fc>0: 56740
Computing DE for cluster 10...
Computing 120637 exact tests and 12 asymptotic tests.
DE: 25743 of 120649 FDR: 59637 log2fc>0: 52247
Computing DE for cluster 11...
Computing 120649 exact tests and 0 asymptotic tests.
DE: 26797 of 120649 FDR: 52651 log2fc>0: 56633
Computing DE for cluster 12...
Computing 120649 exact tests and 0 asymptotic tests.
DE: 16532 of 120649 FDR: 30448 log2fc>0: 56258
Computing DE for cluster 13...
Computing 120649 exact tests and 0 asymptotic tests.
DE: 1803 of 120649 FDR: 3931 log2fc>0: 64635
Computing DE for cluster 14...
Computing 120618 exact tests and 31 asymptotic tests.
DE: 15916 of 120649 FDR: 55867 log2fc>0: 38883
Computing DE for cluster 15...
Computing 120647 exact tests and 2 asymptotic tests.
DE: 3665 of 120649 FDR: 7024 log2fc>0: 48872
Computing DE for cluster 16...
Computing 120649 exact tests and 0 asymptotic tests.
DE: 4281 of 120649 FDR: 5519 log2fc>0: 68372
Computing DE for cluster 17...
Computing 120649 exact tests and 0 asymptotic tests.
DE: 5834 of 120649 FDR: 9142 log2fc>0: 55468
Computing DE for cluster 18...
Computing 120649 exact tests and 0 asymptotic tests.
DE: 6262 of 120649 FDR: 6665 log2fc>0: 84930
Computing DE for cluster 19...
Computing 120649 exact tests and 0 asymptotic tests.
DE: 8358 of 120649 FDR: 10985 log2fc>0: 62346
Computing DE for cluster 20...
Computing 120649 exact tests and 0 asymptotic tests.
DE: 1976 of 120649 FDR: 1978 log2fc>0: 92659
Computing DE for cluster 21...
Computing 120649 exact tests and 0 asymptotic tests.
DE: 11588 of 120649 FDR: 29078 log2fc>0: 48350
Computing DE for cluster 22...
Computing 120649 exact tests and 0 asymptotic tests.
DE: 7313 of 120649 FDR: 16124 log2fc>0: 51803
Computing DE for cluster 23...
Computing 120649 exact tests and 0 asymptotic tests.
DE: 6749 of 120649 FDR: 7413 log2fc>0: 79896

Total markers 353310

Markers summary

In [341]:
print('Number of differential markers per cluster')
t = markers[['cluster']].copy()
t['count'] = 1
display(pd.pivot_table(t, values='count', index=['cluster'], aggfunc=np.sum))
Number of differential markers per cluster
count
cluster
0 28446
1 25383
2 15168
3 28932
4 24212
5 18827
6 16667
7 15701
8 16704
9 20453
10 25743
11 26797
12 16532
13 1803
14 15916
15 3665
16 4281
17 5834
18 6262
19 8358
20 1976
21 11588
22 7313
23 6749
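The same summary is a one-liner with value_counts:

# Equivalent per-cluster marker counts
display(markers['cluster'].value_counts().sort_index())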

Annotate markers with genes

In [342]:
markers_with_genes = pd.merge(left=markers[['cluster', 'norm_mean_cluster', 'norm_mean_others', 
                                            'log2_fold_change', 'p_value', 'adjusted_p_value', 'peak']],
                         right=closest_genes,
                         left_on='peak',
                         right_on='peak')

# Save peak without dashes or colons
markers_with_genes['peak'] = [re.sub('[:-]', '_', p) for p in markers_with_genes['peak']]

# rearrange columns, sort
markers_with_genes = markers_with_genes[['chr', 'start', 'end', 'peak',
                                         'cluster', 'norm_mean_cluster', 'norm_mean_others', 
                                         'log2_fold_change', 'p_value', 'adjusted_p_value', 
                                         'gene', 'distance']].sort_values(by=['cluster', 'adjusted_p_value'])
# display(markers_with_genes.head())

print('Saving all the markers', len(markers_with_genes), 'to markers.tsv')
markers_with_genes.to_csv(os.path.join(OUTPUT_DIR, 'markers.tsv'), sep='\t', index=None)
Saving all the markers 353310 to markers.tsv

Analyzing top markers

In [343]:
def get_top_markers(df, n_clusters, top=100):
    top_markers = pd.concat([
        df.loc[df['cluster'] == c].sort_values(by=['adjusted_p_value']).head(top) 
                    for c in range(0, n_clusters)])
    top_markers['peak'] = top_markers['chr'] +\
                          ':' + top_markers['start'].astype(str) + '-'\
                          + top_markers['end'].astype(str)
    return top_markers
In [344]:
print('Computing Z scores for mean values in cluster for top markers')
top_markers = get_top_markers(markers_with_genes, n_clusters, top=100)
t = coveragedf.T # Transpose to (barcodes x peaks) format
t = t[list(top_markers['peak'])] # top markers only

t = pd.concat([pd.DataFrame(t.loc[clusters == cluster].mean()).T for cluster in set(clusters)])
t.index = range(n_clusters)

# Transform to Z-score
for c in t.columns:
    t[c] = zscore(t[c])

print('Clustermap of Z scores')
with PdfPages(os.path.join(figures_path, 'markers_z.pdf')) as pdf:
    sns.clustermap(t.T, col_cluster=False, row_cluster=False, figsize=(10, 10), cmap=plt.cm.seismic)
    pdf.savefig()
Computing Z scores for mean values in cluster for top markers
Clustermap of Z scores
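scipy.stats.zscore standardizes each column with the population standard deviation (ddof=0); a quick sketch of the equivalence on hypothetical values:

# zscore(x) == (x - mean(x)) / std(x) with ddof=0
x = np.array([1.0, 2.0, 3.0, 4.0])
assert np.allclose(zscore(x), (x - x.mean()) / x.std(ddof=0))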

Top markers visualization on t-SNE

In [345]:
def show_top_markers_z(t, xs, ys, markers_with_genes, clusters, n_clusters, top=100):
    top_markers = get_top_markers(markers_with_genes, n_clusters, top)
    nrows, ncols = n_rows_columns(n_clusters)
    plt.subplots(nrows=nrows, ncols=ncols, figsize=(5 * ncols, 4 * nrows))
    for i, cluster in enumerate(tqdm(set(clusters))):
        # Table (marker for cluster x cells)
        markers_df = t.loc[top_markers.loc[top_markers['cluster'] == cluster]['peak']].T.copy()

        # Z scale for each marker peak across all cells
        for c in markers_df.columns:
            markers_df[c] = zscore(markers_df[c])

        # Average Z score for each cell
        zs = markers_df.T.mean()
        ax = plt.subplot(nrows, ncols, i + 1)
        # Average is much more stable than coverage, we can use linear colormap here
        plt.setp(ax, xticks=[], yticks=[])    
        # Plot zscores scatter plot with 2 segments colormap blue-white-red
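        # The white anchor sits at the relative position of z = 0 inside
        # [min(zs), max(zs)], so white marks cells with average signal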
        cmap = LinearSegmentedColormap.from_list(name='zscore', 
            colors=[(0, 'darkblue'), ((0 - np.min(zs)) / (np.max(zs) - np.min(zs)), 'white'), (1, 'red')])        
        sc = ax.scatter(xs, ys, s=5, c=zs, cmap=cmap, edgecolor='none', alpha=1)
#         sc = ax.scatter(xs, ys, s=5, c=zs, cmap=plt.cm.seismic, edgecolor='none', alpha=1)
        plt.colorbar(sc)
        plt.title(f'Cluster {cluster}')
#             plt.xlabel('tSNE axis 1')
#             plt.ylabel('tSNE axis 2')
In [346]:
print('t-SNE based Z-SCORE visualizations for markers, mean z-score for 100 top markers')
    
with PdfPages(os.path.join(figures_path, 'tsne_diff_markers_100.pdf')) as pdf:
    show_top_markers_z(coveragedf, tsne_coords['tsne1'], tsne_coords['tsne2'], 
                       markers_with_genes, clusters, n_clusters, 100)
    pdf.savefig()
t-SNE based Z-SCORE visualizations for markers, mean z-score for 100 top markers

In [348]:
print('t-SNE based Z-SCORE visualizations for markers, z-score for 1 top marker')
t = coveragedf
top_markers_1 = get_top_markers(markers_with_genes, n_clusters, top=1)
top_markers_map = {}
for i, row in top_markers_1.iterrows():
    top_markers_map[str(row['cluster'])] = row['peak']
    
with PdfPages(os.path.join(figures_path, 'tsne_diff_markers_1.pdf')) as pdf:
    regions_plot(top_markers_map, coveragedf, 
                 tsne_coords['tsne1'], tsne_coords['tsne2'],
                 'tsne1', 'tsne2',
                 factors, clusters, True)
    pdf.savefig()
t-SNE based Z-SCORE visualizations for markers, z-score for 1 top marker
Coverage z-score, distribution and fraction on non-zero covered peaks w.r.t type and clusters
0 chr8:123749906-123750835 cov>0: 1876 of 31860 max cov: 5
1 chr10:93446017-93448149 cov>0: 1845 of 31860 max cov: 5
2 chr6:134802353-134803276 cov>0: 1562 of 31860 max cov: 7
3 chr13:36463486-36464386 cov>0: 771 of 31860 max cov: 4
4 chr3:103384684-103387169 cov>0: 1958 of 31860 max cov: 7
5 chr11:46795550-46796334 cov>0: 1022 of 31860 max cov: 4
6 chr11:105988609-105991187 cov>0: 3148 of 31860 max cov: 7
7 chr18:56446797-56448255 cov>0: 1264 of 31860 max cov: 6
8 chr10:20090419-20091390 cov>0: 916 of 31860 max cov: 6
9 chr3:79544019-79544477 cov>0: 188 of 31860 max cov: 3
10 chr5:149648289-149649821 cov>0: 548 of 31860 max cov: 9
11 chr10:89791698-89792455 cov>0: 572 of 31860 max cov: 4
12 chr17:83616043-83616909 cov>0: 982 of 31860 max cov: 4
13 chr19:43511591-43512642 cov>0: 1684 of 31860 max cov: 5
14 chr12:35597218-35598814 cov>0: 583 of 31860 max cov: 6
15 chr9:57325773-57326627 cov>0: 860 of 31860 max cov: 5
16 chr16:96387597-96388655 cov>0: 804 of 31860 max cov: 4
17 chr15:78493034-78495727 cov>0: 2033 of 31860 max cov: 10
18 chr4:3080356-3084467 cov>0: 1026 of 31860 max cov: 9
19 chr2:168500726-168501284 cov>0: 224 of 31860 max cov: 4
20 chr4:120223514-120224273 cov>0: 374 of 31860 max cov: 4
21 chr10:95914467-95914961 cov>0: 182 of 31860 max cov: 4
22 chr2:84964004-84964673 cov>0: 236 of 31860 max cov: 4
23 chr6:26766361-26767225 cov>0: 226 of 31860 max cov: 2

Save differential markers to BED

In [364]:
cluster_peaks_path = os.path.join(OUTPUT_DIR, 'markers')
! mkdir -p {cluster_peaks_path}
print('Cleanup peaks_clusters', cluster_peaks_path)
for f in glob.glob(f'{cluster_peaks_path}/markers_*.bed'):
    os.remove(f)

mlap_max = (-np.log10(markers.loc[markers['adjusted_p_value'] != 0]['adjusted_p_value'])).max()
print('BED scoring as -log10 adjusted pval')
print('Max of -log10 adjusted pval', mlap_max)

for c in set(clusters):
    bed_file = os.path.join(cluster_peaks_path, f'markers_{c}.bed')
    markers_cluster = markers.loc[markers['cluster'] == c].sort_values(
        ['log2_fold_change'], ascending=True)  # ascending by log2 fold change
    markers_cluster.index = range(len(markers_cluster))
    print('Saving cluster', c, 'marker peaks to', bed_file)
    with open(bed_file, 'w') as bed:
        for i in range(len(markers_cluster)):
            peak = markers_cluster['peak'][i]
            ap = markers_cluster['adjusted_p_value'][i]
            mlap = -np.log10(ap) if ap != 0.0 else mlap_max
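            # BED6 fields: chrom, start, end, name = cluster id,
            # score = -log10(adjusted p-value), strand = '.'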
            line = '{}\t{}\t{}\t.\n'.format(
                re.sub(':|-', '\t', peak), 
                c,
                mlap)
            bed.write(line)
print('Done')                
Cleanup peaks_clusters /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers
BED scoring as -log10 adjusted pval
Max of -log10 adjusted pval 317.2185311557059
Saving cluster 0 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_0.bed
Saving cluster 1 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_1.bed
Saving cluster 2 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_2.bed
Saving cluster 3 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_3.bed
Saving cluster 4 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_4.bed
Saving cluster 5 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_5.bed
Saving cluster 6 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_6.bed
Saving cluster 7 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_7.bed
Saving cluster 8 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_8.bed
Saving cluster 9 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_9.bed
Saving cluster 10 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_10.bed
Saving cluster 11 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_11.bed
Saving cluster 12 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_12.bed
Saving cluster 13 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_13.bed
Saving cluster 14 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_14.bed
Saving cluster 15 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_15.bed
Saving cluster 16 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_16.bed
Saving cluster 17 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_17.bed
Saving cluster 18 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_18.bed
Saving cluster 19 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_19.bed
Saving cluster 20 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_20.bed
Saving cluster 21 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_21.bed
Saving cluster 22 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_22.bed
Saving cluster 23 marker peaks to /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/markers/markers_23.bed
Done

Data preparation for single cell explorer

Preprocess data for the single cell explorer: https://ctlab.github.io/SCE

In [365]:
def export_to_sce(df, clusters_df, tsne_coords, umap_coords, top_markers,
                  sce_path, token, name, public, organism):
    print('SCE folder', sce_path)
    # Sanity check indexes
    assert np.array_equal(tsne_coords.index, umap_coords.index)
    assert np.array_equal(tsne_coords.index, df.columns)
    assert np.array_equal(clusters_df.index, df.columns)
    clusters = clusters_df['cluster']

    print('Processing', 'dataset.json')
    with open(os.path.join(sce_path, 'dataset.json'), 'w') as f:
        f.write(json.dumps({
            'token': token,
            'public': public,
            'name': name,
            'organism': organism,
            'cells': df.shape[1]
        }))


    print('Processing', 'plot_data.json')
    fields = {
      "tSNE_1": {
        "type": "numeric",
        "range": [
          min(tsne_coords.loc[df.columns.values]['tsne1']),
          max(tsne_coords.loc[df.columns.values]['tsne1'])
        ]
      },
      "tSNE_2": {
        "type": "numeric",
        "range": [
          min(tsne_coords.loc[df.columns.values]['tsne2']),
          max(tsne_coords.loc[df.columns.values]['tsne2'])
        ]
      },
      "UMAP_1": {
        "type": "numeric",
        "range": [
          min(umap_coords.loc[df.columns.values]['umap1']),
          max(umap_coords.loc[df.columns.values]['umap1'])
        ]
      },
      "UMAP_2": {
        "type": "numeric",
        "range": [
          min(umap_coords.loc[df.columns.values]['umap2']),
          max(umap_coords.loc[df.columns.values]['umap2'])
        ]
      },
      "Cluster": {
        "type": "factor",
        "levels": [str(c) for c in set(clusters)]
      },
      "Age": {
        "type": "factor",
        "levels": list(FACTORS_MAP.values())
      },    
      "nUmi": {
        "type": "numeric",
        "range": [
          min(df.sum()),
          max(df.sum()),
        ]
      }
    }

    data = pd.DataFrame({      
        "tSNE_1": tsne_coords.loc[df.columns.values]['tsne1'].tolist(),
        "tSNE_2": tsne_coords.loc[df.columns.values]['tsne2'].tolist(),
        "UMAP_1": umap_coords.loc[df.columns.values]['umap1'].tolist(),
        "UMAP_2": umap_coords.loc[df.columns.values]['umap2'].tolist(),
        "Cluster": [str(c) for c in clusters_df.loc[df.columns.values]['cluster']],
        "Age": [FACTORS_MAP[FACTOR_FUNCTION(bc)] for bc in df.columns],
        "nUmi": df.sum().tolist(),
        "_row": list(df.columns)
    }).to_dict('records')

    annotations = {
      "tsne_Cluster_centers": {
        "type": "text",
        "value": "Cluster",
        "coords": [
          "tSNE_1",
          "tSNE_2"
        ],
        "data": [
          {
            "Cluster": str(c),
            "tSNE_1": float(np.mean(tsne_coords.loc[df.columns.values].values[
                np.flatnonzero(clusters_df.loc[df.columns.values]['cluster'] == c), 0])),
            "tSNE_2": float(np.mean(tsne_coords.loc[df.columns.values].values[
                np.flatnonzero(clusters_df.loc[df.columns.values]['cluster'] == c), 1])),
            "Text": str(c)
          } for c in set(clusters)
        ]    
      },
    }

    with open(os.path.join(sce_path, 'plot_data.json'), 'w') as f:
        f.write(json.dumps({
            "fields": fields,
            "data": data,
            "annotations": annotations,
        }))


    # This file simply contains gene names, cell barcodes/names and total UMI per cell. 
    # This file must reflect row and column names of matrix data.h5 which contains expression data.    
    print('Processing', 'exp_data.json')
    with open(os.path.join(sce_path, 'exp_data.json'), 'w') as f:
        f.write(json.dumps({
          "genes": list(df.index.values),
          "barcodes": list(df.columns.values),
          "totalCounts": df.sum(axis=0).astype(int).tolist()
        }))


    print('Processing', 'data.h5')
    ! rm -f {sce_path}/data.h5  # -f: don't fail if the file doesn't exist yet
    # Fix missing locks
    os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'
    # Don't use pandas to_hdf5 because of unused overhead.
    # IMPORTANT: transpose matrix because of different order!
    # Column-major order is used by Fortran, Matlab, R, and most underlying core linear algebra libraries (BLAS). 
    # Row-major ordering is sometimes called “C” style ordering and column-major ordering “Fortran” style. 
    # Python/NumPy refers to the orderings in array flags as C_CONTIGUOUS and F_CONTIGUOUS, respectively.
    # https://cran.r-project.org/web/packages/reticulate/vignettes/arrays.html
    expdf = df.astype(int).clip(upper=1) # Clip coverage to 0-1 for better visualization
    with h5py.File(os.path.join(sce_path, 'data.h5'), 'w') as hf:
        hf.create_dataset("expression/mat", 
                          data=expdf.T.values, # Column-major ordering!
                          compression="gzip", compression_opts=9, 
                          dtype='i4') # 4-byte int (int32) is required here for SCE, see https://github.com/ctlab/SCE/issues/4


    print('Processing', 'markers.json')
    # Take into account only 100 top markers
    with open(os.path.join(sce_path, 'markers.json'), 'w') as f:
        f.write(json.dumps({
            "Cluster": [ 
              {
                  'X': row['gene'],
                  'p_val': row['p_value'],
                  'p_val_adj': row['adjusted_p_value'],
                  'avg_logFC': row['log2_fold_change'],
                  'pct.1': row['norm_mean_cluster'],
                  'pct.2': row['norm_mean_others'],
                  'cluster': str(row['cluster']),
                  'gene': row['peak'],
                  'distance': row['distance']
              } for index, row in top_markers.iterrows()
          ]
        }))

    print('Done')
In [366]:
sce_path = os.path.join(OUTPUT_DIR, 'sce')
! mkdir -p {sce_path}

export_to_sce(coveragedf.loc[peaks_filter], clusters_df, tsne_coords, umap_coords,
              top_markers=get_top_markers(markers_with_genes, n_clusters, top=100),
              sce_path=sce_path,
              token='2019_scatacseq_human3vs3', 
              name='Single cell ATAC-Seq 3vs3',
              public=True,
              organism='hg')
SCE folder /mnt/stripe/shpynov/2019_gxfer1_DT1634_Nastya/pipeline/sce
Processing dataset.json
Processing plot_data.json
Processing exp_data.json
Processing data.h5
Processing markers.json
Done
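A quick read-back check of the exported matrix (a sketch; assumes the export above has completed):

# data.h5 holds a (barcodes x peaks) matrix because the dataframe
# was transposed on write
with h5py.File(os.path.join(sce_path, 'data.h5'), 'r') as hf:
    mat = hf['expression/mat'][()]
print(mat.shape)  # (number of cells, number of peaks)
assert mat.shape[0] == coveragedf.shape[1]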
In [ ]: