Source code for dance.datasets.spatial

import os
import os.path as osp
import warnings
from pprint import pformat

import cv2
import h5py
import numpy as np
import pandas as pd
import scanpy as sc
import tifffile
from anndata import AnnData

from dance import logger
from dance.data import Data
from dance.datasets.base import BaseDataset
from dance.registry import register_dataset
from dance.typing import Tuple
from dance.utils.download import download_file, download_unzip, unzip_file


@register_dataset("spatial")
class SpatialLIBDDataset(BaseDataset):
    """Spatial dataset made of an expression matrix, a histology image, spot positions and labels.

    Every dataset is fetched as a zip archive from ``URL_DICT`` and unpacked into its own
    ``<data_dir>/<data_id>`` directory.

    Parameters
    ----------
    root
        Forwarded to :class:`BaseDataset`.
    full_download
        Forwarded to :class:`BaseDataset`.
    data_id
        Identifier of the dataset to load. Downloading requires it to be a key of ``URL_DICT``.
    data_dir
        Parent directory under which one sub-directory per dataset is created.
    sample_file
        Optional name of a text file located in the dataset directory that holds one integer
        row index per line. When given, only those rows are kept.

    """

    _DISPLAY_ATTRS = ("data_id", )
    URL_DICT = {
        "151510": "https://www.dropbox.com/sh/41h9brsk6my546x/AADa18mkJge-KQRTndRelTpMa?dl=1",
        "151507": "https://www.dropbox.com/sh/m3554vfrdzbwv2c/AACGsFNVKx8rjBgvF7Pcm2L7a?dl=1",
        "151508": "https://www.dropbox.com/sh/tm47u3fre8692zt/AAAJJf8-za_Lpw614ft096qqa?dl=1",
        "151509": "https://www.dropbox.com/sh/hihr7906vyirjet/AACslV5mKIkF2CF5QqE1LE6ya?dl=1",
        "151669": "https://www.dropbox.com/sh/ulw2nnnmgtbswvc/AAC0fT549EwtxKZWWoB89gb4a?dl=1",
        "151670": "https://www.dropbox.com/sh/8fw44zyyjgh0ddc/AAA1asGAmyDiMmvhRmL7pN1Na?dl=1",
        "151671": "https://www.dropbox.com/sh/9g5qzd5ykx2mpk3/AAD3xjx1i2h0RhYBc-Vft6CEa?dl=1",
        "151672": "https://www.dropbox.com/sh/l6519tr280krd4p/AAAWefCSp2iKhVmLgytlyxTta?dl=1",
        "151673": "https://www.dropbox.com/sh/qc64ps6gd64dm0c/AAC_5_mP4AczKj8lORLLKcIba?dl=1",
        "151674": "https://www.dropbox.com/sh/q7io99psd2xuqgw/AABske8dgX_kc1oaDSxuiqjpa?dl=1",
        "151675": "https://www.dropbox.com/sh/uahka2h5klnrzvj/AABe7K0_ewqOcqKUxHebE6qLa?dl=1",
        "151676": "https://www.dropbox.com/sh/jos5jjurezy5zp1/AAB2uaVm3-Us1a4mDkS1Q-iAa?dl=1",
        "mpb": "https://www.dropbox.com/scl/fo/psxybrqayr669yeu4ccnc/h?rlkey=9hilmps3wpso1z0xuoceiouq4&dl=1",
        "pancreatic_cancer": "https://www.dropbox.com/scl/fo/2vwvyrgou6mxptkqphlq5/h?rlkey=8pkg6mtdnv637bk35idu8bnk7&dl=1",
        "human_breast_cancer": "https://www.dropbox.com/scl/fo/hp82ugxvxro4u6e5spl1s/h?rlkey=15uqpgk6hbtrqxy9rz93b34wf&dl=1",
        "sub_mba": "https://www.dropbox.com/scl/fo/ryh06rm9lwj4s2cizejzi/h?rlkey=nfrcz8m0o7cnjobhtxvkfsd0k&dl=1",
        # "sub_pancreatic_cancer":"",
        # "sub_human_breast_cancer":""
    }
    AVAILABLE_DATA = sorted(URL_DICT)

    def __init__(self, root=".", full_download=False, data_id="151673", data_dir="data/spatial", sample_file=None):
        super().__init__(root, full_download)
        self.data_id = data_id
        # Keep the parent directory around so that per-dataset directories can be derived
        # when iterating over all datasets in ``download_all`` / ``is_complete_all``.
        self._base_data_dir = data_dir
        self.data_dir = self._dataset_dir(data_id)
        self.sample_file = sample_file

    def _dataset_dir(self, data_id):
        """Return the directory that holds the files of ``data_id``."""
        return osp.join(self._base_data_dir, str(data_id))

    def download_all(self):
        """Download every dataset listed in ``URL_DICT``, each into its own directory.

        The currently selected ``data_id`` / ``data_dir`` are restored afterwards, even if
        one of the downloads raises.

        """
        logger.info(f"All data includes {len(self.URL_DICT)} datasets: {list(self.URL_DICT)}")
        selected = (self.data_id, self.data_dir)
        try:
            for data_id in self.URL_DICT:
                # Switch both the id and the directory: the archives share file names
                # (e.g. ``tissue_positions_list.txt``) and would overwrite one another if
                # they were all unpacked into the same directory.
                self.data_id = data_id
                self.data_dir = self._dataset_dir(data_id)
                self.download()
        finally:
            self.data_id, self.data_dir = selected

    def is_complete_all(self):
        """Return True if every dataset listed in ``URL_DICT`` passes :meth:`is_complete`."""
        selected = (self.data_id, self.data_dir)
        try:
            for data_id in self.URL_DICT:
                self.data_id = data_id
                self.data_dir = self._dataset_dir(data_id)
                if not self.is_complete():
                    return False
            return True
        finally:
            self.data_id, self.data_dir = selected

    def download(self):
        """Download the archive of the selected dataset and unzip it into ``data_dir``."""
        out_path = osp.join(self.data_dir, f"{self.data_id}.zip")
        # Only unzip when ``download_file`` returns a truthy value.
        if download_file(self.URL_DICT[self.data_id], out_path):
            unzip_file(out_path, self.data_dir)

    def is_complete(self):
        """Return True if the expression, histology and position files all exist."""
        required_paths = [
            osp.join(self.data_dir, f"{self.data_id}_raw_feature_bc_matrix.h5"),  # expression
            osp.join(self.data_dir, f"{self.data_id}_full_image.tif"),  # histology
            osp.join(self.data_dir, "tissue_positions_list.txt"),  # positions
        ]
        for path in required_paths:
            if not os.path.exists(path):
                logger.info(f"lack {path}")
                return False
        return True

    def _load_raw_data(self):
        """Read the image, expression matrix, spot positions and labels from ``data_dir``.

        Returns
        -------
        tuple
            ``(img, adata, xy, xy_pixel, meta_df)``. ``img`` is ``None`` when the image could
            not be read.

        """
        image_path = osp.join(self.data_dir, f"{self.data_id}_full_image.tif")
        data_path = osp.join(self.data_dir, f"{self.data_id}_raw_feature_bc_matrix.h5")
        spatial_path = osp.join(self.data_dir, "tissue_positions_list.txt")
        meta_path = osp.join(self.data_dir, "cluster_labels.csv")

        logger.info(f"Loading expression data from {data_path}")
        # Peek at the HDF5 layout to pick the reader: files with a top-level "matrix" entry
        # go through ``read_10x_h5``, everything else through ``read_h5ad``. The handle is
        # closed again before the actual reader opens the file.
        with h5py.File(data_path, "r") as h5_file:
            has_matrix_group = "matrix" in h5_file
        if has_matrix_group:
            adata = sc.read_10x_h5(data_path)
        else:
            adata = sc.read_h5ad(data_path)

        logger.info(f"Loading spatial info from {spatial_path}")
        spatial_full = pd.read_csv(spatial_path, header=None, index_col=0)
        spatial_full.index = spatial_full.index.astype(str)
        spatial = spatial_full.loc[adata.obs_names]

        logger.info(f"Loading label info from {meta_path}")
        meta_df = pd.read_csv(meta_path)

        # Restrict to captured spots
        indicator = spatial[1].values == 1
        adata = adata[indicator]
        spatial = spatial.iloc[indicator]

        # Prepare spatial info tables
        xy = spatial[[2, 3]].rename(columns={2: "x", 3: "y"})
        xy_pixel = spatial[[4, 5]].rename(columns={4: "x_pixel", 5: "y_pixel"}).astype(int)

        # Prepare meta data and create a column with indexed label info
        label_classes = {j: i for i, j in enumerate(meta_df["ground_truth"].unique())}
        meta_df["label"] = list(map(label_classes.get, meta_df["ground_truth"]))

        logger.info(f"Loading image data from {image_path}")
        img = cv2.imread(image_path)
        if img is None:
            # ``cv2.imread`` reports failure by returning None; this must be checked before
            # any array operation is attempted on the result.
            logger.info("image doesn't exist,use louvain")
        elif (img == 0).all():
            # An all-zero result is treated as a failed decode; re-read with tifffile.
            with tifffile.TiffFile(image_path) as tif:
                img = tif.series[0].levels[0].asarray()
            img = img.squeeze()
            if len(img.shape) == 2:
                # Replicate a single-channel image into three channels.
                img = np.stack([img, img, img], axis=2)

        return img, adata, xy, xy_pixel, meta_df

    def _raw_to_dance(self, raw_data):
        """Assemble the raw pieces into a :class:`Data` object.

        Parameters
        ----------
        raw_data
            The tuple returned by :meth:`_load_raw_data`.

        """
        img, adata, xy, xy_pixel, meta_df = raw_data
        adata.var_names_make_unique()
        adata.obs = meta_df.set_index(adata.obs_names)
        adata.obsm["spatial"] = xy.set_index(adata.obs_names)
        adata.obsm["spatial_pixel"] = xy_pixel.set_index(adata.obs_names)
        adata.uns["image"] = img

        if self.sample_file is not None:
            sample_file = osp.join(self.data_dir, self.sample_file)
            with open(sample_file) as file:
                # Skip blank lines (e.g. a trailing empty line) instead of failing on int("").
                sample_index = [int(line) for line in file if line.strip()]
            adata = adata[sample_index]

        data = Data(adata, train_size="all")
        return data
@register_dataset("spatial")
class CellTypeDeconvoDataset(BaseDataset):
    """Load raw data.

    Parameters
    ----------
    data_dir
        Parent directory; the files of the dataset live in ``<data_dir>/<data_id>``.
    data_id
        Identifier of the dataset, must be a key of ``URL_DICT``.
    subset_common_celltypes
        If set to True, then subset both the reference and the real data to contain only cell types that are
        present in both reference and real.

    Raises
    ------
    ValueError
        If ``data_id`` is not a key of ``URL_DICT``.

    """

    _DISPLAY_ATTRS = ("data_id", "subset_common_celltypes")
    _IGNORE_FILES = ["readme.txt"]
    # Table names (file name without extension) that ``_load_raw_data`` cannot work without.
    _REQUIRED_TABLES = ("ref_sc_count", "ref_sc_annot", "mix_count", "true_p")
    URL_DICT = {
        "CARD_synthetic": "https://www.dropbox.com/sh/v0vpv0jsnfexj7f/AADpizLGOrF7M8EesDihgbBla?dl=1",
        "GSE174746": "https://www.dropbox.com/sh/spfv06yfttetrab/AAAgORS6ocyoZEyxiRYKTymCa?dl=1",
        "SPOTLight_synthetic": "https://www.dropbox.com/sh/p1tfb0xe1yl2zpe/AAB6cF-BsdJcHToet_C-AlXAa?dl=1",
        "human PDAC": "https://www.dropbox.com/sh/9py6hk9j1ygyprh/AAAOKTo-TE_eX4JJg0HIFfZ7a?dl=1",
        "mouse brain 1": "https://www.dropbox.com/sh/e2nl247v1jrd7h8/AAC1IUlk_3vXUvfk2fv9L2D3a?dl=1",
        "lung6": "https://www.dropbox.com/scl/fo/ak4y1nu6ozi93cspmmkm1/h?rlkey=j5m62jxprcozskov3q0pzjdzv&dl=1",
        "lung9_1": "https://www.dropbox.com/scl/fo/9ektj6rhlnza4ox7a73e2/h?rlkey=kichva2ag2kc2s64vx5wcyt4e&dl=1",
        "lung9_2": "https://www.dropbox.com/scl/fo/lfhu461ls43ehj3y6uxj1/h?rlkey=7ft9y03rj4klwvdcfm6s6kzvt&dl=1",
        "hcc_liver": "https://www.dropbox.com/scl/fo/txi4jumgero25n929zssp/h?rlkey=23bzxg1a7f0w1orjkifz8zfo5&dl=1",
        "normal_liver": "https://www.dropbox.com/scl/fo/23qcxc0ibc6ki3hc3dixo/h?rlkey=0os9js49ut01g1fntn0i5m7zk&dl=1",
        "kidney_1139": "https://www.dropbox.com/scl/fo/ek7d6sdpugr3wqjf30k3l/h?rlkey=ey0vwkf1v9u5h6ea1hx5ldvsz&dl=1",
        "kidney_10838": "https://www.dropbox.com/scl/fo/p7vs52wdmqxfl8aah7vx1/h?rlkey=5jqo4p6s8nsqjgjbhzs91sukb&dl=1",
        "kidney_3323": "https://www.dropbox.com/scl/fo/zmjrg8h8bradykumx60q8/h?rlkey=8amhfq4vljw588l4sr355buk9&dl=1",
        "kidney_642": "https://www.dropbox.com/scl/fo/6g8eaftskncv8vj2d1zp9/h?rlkey=td5vqltxiyn3mrvlt769n3hyb&dl=1",
        "kidney_8693": "https://www.dropbox.com/scl/fo/rzujsz9hza1uak1fi3e64/h?rlkey=6bnkqj0isva36wxfol49kxrtl&dl=1",
        "kidney_2566": "https://www.dropbox.com/scl/fo/f3ms854h7j1j6z78mep92/h?rlkey=id7l6uny3m4ut0bgqc6ozgdkx&dl=1",
        "kidney_213": "https://www.dropbox.com/scl/fo/1ntibep89ph0wmn5ml6o6/h?rlkey=of0dci4fz50xkur2xkpbdxm7l&dl=1",
        "kidney_4061": "https://www.dropbox.com/scl/fo/abzhq1dr83n1azhk5v62x/h?rlkey=jrmy26d631wd9y3rmae4stlvi&dl=1",
        "kidney_1098": "https://www.dropbox.com/scl/fo/lslmdwodhfinnj1v81x5t/h?rlkey=eczhxo4r8flnrg02ub26oxjon&dl=1",
        "kidney_8471": "https://www.dropbox.com/scl/fo/eoxwm0h5oexv60anhobq9/h?rlkey=wpajrub5kqpz7n0enz6l8zw7i&dl=1"
    }
    AVAILABLE_DATA = sorted(URL_DICT)

    def __init__(self, data_dir="data/spatial", data_id="GSE174746", subset_common_celltypes: bool = True):
        super().__init__(data_dir)

        if data_id not in self.URL_DICT:
            raise ValueError(f"Unknown data_id {data_id!r}, available datasets are: {self.AVAILABLE_DATA}")

        self.data_id = data_id
        self.data_url = self.URL_DICT[data_id]
        self.data_dir = osp.join(data_dir, data_id)
        self.subset_common_celltypes = subset_common_celltypes

    def download(self):
        """Download and unzip the archive of the selected dataset into ``data_dir``."""
        download_unzip(self.data_url, self.data_dir)

    def is_complete(self):
        """Return True if the dataset directory exists."""
        return osp.exists(self.data_dir)

    def _load_raw_data(self) -> Tuple[pd.DataFrame, ...]:
        """Read all csv/h5ad tables from ``data_dir`` and return the ones needed downstream.

        Returns
        -------
        tuple
            ``(ref_count, ref_annot, count_matrix, cell_type_portion, spatial)``.

        Raises
        ------
        KeyError
            If one of the required tables is not present in ``data_dir``.

        """
        raw_data_dict = {}
        for f in os.listdir(self.data_dir):
            if f in self._IGNORE_FILES:
                continue

            filepath = osp.join(self.data_dir, f)
            filename, ext = osp.splitext(f)
            if ext == ".csv":
                table = pd.read_csv(filepath, header=0, index_col=0)
            elif ext == ".h5ad":
                table = sc.read_h5ad(filepath).to_df()
            else:
                warnings.warn(f"Unsupported file type {ext!r}. Only csv or h5ad are supported now.")
                continue

            # Normalise the index to strings so tables loaded from csv and h5ad line up.
            table.index = table.index.astype(str)
            raw_data_dict[filename] = table

        missing = [name for name in self._REQUIRED_TABLES if name not in raw_data_dict]
        if missing:
            # Still a KeyError (as a plain dict lookup would raise), but with an actionable message.
            raise KeyError(f"Required table(s) {missing} not found in {self.data_dir!r}")

        ref_count = raw_data_dict["ref_sc_count"]
        ref_annot = raw_data_dict["ref_sc_annot"]
        count_matrix = raw_data_dict["mix_count"]
        cell_type_portion = raw_data_dict["true_p"]
        if (spatial := raw_data_dict.get("spatial_location")) is None:
            # No location table available: fall back to all-zero coordinates.
            spatial = pd.DataFrame(0, index=count_matrix.index, columns=["x", "y"])

        # Obtain cell type info and subset to common cell types between ref and real if needed
        ref_celltypes = set(ref_annot["cellType"].unique().tolist())
        real_celltypes = set(cell_type_portion.columns.tolist())
        logger.info(f"Number of cell types: reference = {len(ref_celltypes)}, real = {len(real_celltypes)}")
        if self.subset_common_celltypes:
            common_celltypes = sorted(ref_celltypes & real_celltypes)
            logger.info(f"Subsetting to common cell types (n={len(common_celltypes)}):\n{pformat(common_celltypes)}")

            idx = ref_annot[ref_annot["cellType"].isin(common_celltypes)].index
            ref_annot = ref_annot.loc[idx]
            ref_count = ref_count.loc[idx]

            cell_type_portion = cell_type_portion[common_celltypes]

        return ref_count, ref_annot, count_matrix, cell_type_portion, spatial

    def _raw_to_dance(self, raw_data: Tuple[pd.DataFrame, ...]):
        """Combine the inference and reference tables into a single :class:`Data` object.

        Parameters
        ----------
        raw_data
            The tuple returned by :meth:`_load_raw_data`.

        """
        ref_count, ref_annot, count_matrix, cell_type_portion, spatial = raw_data

        # Cast the matrices up front instead of passing ``dtype=`` to ``AnnData``: that
        # constructor argument is deprecated, and casting here yields the same float32 ``X``.
        adata_inf = AnnData(
            count_matrix.values.astype(np.float32),
            obs=pd.DataFrame(index=count_matrix.index.tolist()),
            var=pd.DataFrame(index=count_matrix.columns.tolist()),
        )
        adata_inf.obsm["cell_type_portion"] = cell_type_portion.astype(np.float32)
        adata_inf.obsm["spatial"] = spatial.astype(np.float32)

        adata_ref = AnnData(
            ref_count.values.astype(np.float32),
            obs=ref_annot,
            var=pd.DataFrame(index=ref_count.columns.tolist()),
        )

        # FIX: If we switch the order of the append below, i.e., append inf to ref, we get the following error
        # ValueError: Length mismatch: Expected axis has 520 elements, new values have 10454 elements
        # This is possibly a BUG in the anndata package.
        data = Data(adata_inf, full_split_name="test")
        data.append(Data(adata_ref, full_split_name="ref"), join="outer", label_batch=True)

        return data