import os
import os.path as osp
import warnings
from pprint import pformat
import cv2
import h5py
import numpy as np
import pandas as pd
import scanpy as sc
import tifffile
from anndata import AnnData
from dance import logger
from dance.data import Data
from dance.datasets.base import BaseDataset
from dance.registry import register_dataset
from dance.typing import Tuple
from dance.utils.download import download_file, download_unzip, unzip_file
@register_dataset("spatial")
class SpatialLIBDDataset(BaseDataset):
    """Spatial dataset made of an expression matrix, spot positions, spot labels, and a histology image.

    Parameters
    ----------
    root
        Forwarded unchanged to ``BaseDataset.__init__``.
    full_download
        Forwarded unchanged to ``BaseDataset.__init__``.
    data_id
        Key of :attr:`URL_DICT` selecting the sample to download and load.
    data_dir
        Parent directory; the files of a sample are stored in ``<data_dir>/<data_id>``.
    sample_file
        Optional name of a text file located in the sample directory that holds one integer row position per line.
        When given, the loaded data are subset to these rows.

    """

    _DISPLAY_ATTRS = ("data_id", )
    URL_DICT = {
        "151510": "https://www.dropbox.com/sh/41h9brsk6my546x/AADa18mkJge-KQRTndRelTpMa?dl=1",
        "151507": "https://www.dropbox.com/sh/m3554vfrdzbwv2c/AACGsFNVKx8rjBgvF7Pcm2L7a?dl=1",
        "151508": "https://www.dropbox.com/sh/tm47u3fre8692zt/AAAJJf8-za_Lpw614ft096qqa?dl=1",
        "151509": "https://www.dropbox.com/sh/hihr7906vyirjet/AACslV5mKIkF2CF5QqE1LE6ya?dl=1",
        "151669": "https://www.dropbox.com/sh/ulw2nnnmgtbswvc/AAC0fT549EwtxKZWWoB89gb4a?dl=1",
        "151670": "https://www.dropbox.com/sh/8fw44zyyjgh0ddc/AAA1asGAmyDiMmvhRmL7pN1Na?dl=1",
        "151671": "https://www.dropbox.com/sh/9g5qzd5ykx2mpk3/AAD3xjx1i2h0RhYBc-Vft6CEa?dl=1",
        "151672": "https://www.dropbox.com/sh/l6519tr280krd4p/AAAWefCSp2iKhVmLgytlyxTta?dl=1",
        "151673": "https://www.dropbox.com/sh/qc64ps6gd64dm0c/AAC_5_mP4AczKj8lORLLKcIba?dl=1",
        "151674": "https://www.dropbox.com/sh/q7io99psd2xuqgw/AABske8dgX_kc1oaDSxuiqjpa?dl=1",
        "151675": "https://www.dropbox.com/sh/uahka2h5klnrzvj/AABe7K0_ewqOcqKUxHebE6qLa?dl=1",
        "151676": "https://www.dropbox.com/sh/jos5jjurezy5zp1/AAB2uaVm3-Us1a4mDkS1Q-iAa?dl=1",
        "mpb": "https://www.dropbox.com/scl/fo/psxybrqayr669yeu4ccnc/h?rlkey=9hilmps3wpso1z0xuoceiouq4&dl=1",
        "pancreatic_cancer":
        "https://www.dropbox.com/scl/fo/2vwvyrgou6mxptkqphlq5/h?rlkey=8pkg6mtdnv637bk35idu8bnk7&dl=1",
        "human_breast_cancer":
        "https://www.dropbox.com/scl/fo/hp82ugxvxro4u6e5spl1s/h?rlkey=15uqpgk6hbtrqxy9rz93b34wf&dl=1",
        "sub_mba": "https://www.dropbox.com/scl/fo/ryh06rm9lwj4s2cizejzi/h?rlkey=nfrcz8m0o7cnjobhtxvkfsd0k&dl=1",
        # "sub_pancreatic_cancer":"",
        # "sub_human_breast_cancer":""
    }
    AVAILABLE_DATA = sorted(URL_DICT)

    def __init__(self, root=".", full_download=False, data_id="151673", data_dir="data/spatial", sample_file=None):
        super().__init__(root, full_download)
        self.data_id = data_id
        # Keep the parent directory so that other samples can be addressed by download_all / is_complete_all.
        self._base_data_dir = data_dir
        self.data_dir = self._sample_dir(data_id)
        self.sample_file = sample_file

    def _sample_dir(self, data_id):
        """Return the directory that holds the files of sample ``data_id``."""
        return osp.join(self._base_data_dir, str(data_id))

    def _run_with_data_id(self, data_id, method):
        """Call ``method()`` while ``data_id`` and ``data_dir`` temporarily point at another sample.

        Both attributes are restored afterwards, also when ``method`` raises.

        """
        saved = (self.data_id, self.data_dir)
        self.data_id, self.data_dir = data_id, self._sample_dir(data_id)
        try:
            return method()
        finally:
            self.data_id, self.data_dir = saved

    def download_all(self):
        """Download every sample listed in :attr:`URL_DICT`, each into its own ``<data_dir>/<data_id>`` directory."""
        logger.info(f"All data includes {len(self.URL_DICT)} datasets: {list(self.URL_DICT)}")
        for data_id in self.URL_DICT:
            self._run_with_data_id(data_id, self.download)

    def is_complete_all(self):
        """Return True only if :meth:`is_complete` holds for every sample in :attr:`URL_DICT`.

        Checking stops at the first incomplete sample.

        """
        return all(self._run_with_data_id(data_id, self.is_complete) for data_id in self.URL_DICT)

    def download(self):
        """Download the zip archive of the current sample and extract it into ``self.data_dir``.

        Extraction only happens when ``download_file`` returns a truthy value.

        Raises
        ------
        KeyError
            If ``self.data_id`` is not a key of :attr:`URL_DICT`.

        """
        out_path = osp.join(self.data_dir, f"{self.data_id}.zip")
        if download_file(self.URL_DICT[self.data_id], out_path):
            unzip_file(out_path, self.data_dir)

    def is_complete(self):
        """Return True if the expression, histology, and position files of the current sample all exist.

        The first missing path is logged before returning False.

        """
        # NOTE: ``cluster_labels.csv``, which ``_load_raw_data`` also reads, is not part of this check.
        required = [
            osp.join(self.data_dir, f"{self.data_id}_raw_feature_bc_matrix.h5"),  # expression
            osp.join(self.data_dir, f"{self.data_id}_full_image.tif"),  # histology
            osp.join(self.data_dir, "tissue_positions_list.txt"),  # positions
        ]
        for path in required:
            if not osp.exists(path):
                logger.info(f"lack {path}")
                return False
        return True

    def _load_raw_data(self):
        """Read the raw files of the current sample from ``self.data_dir``.

        Returns
        -------
        tuple
            ``(img, adata, xy, xy_pixel, meta_df)`` where ``img`` is the image array (or None when the image could not
            be read), ``adata`` is the expression data restricted to captured spots, ``xy`` / ``xy_pixel`` are the
            position tables of these spots, and ``meta_df`` is the label table with an extra integer ``label`` column.

        """
        image_path = osp.join(self.data_dir, f"{self.data_id}_full_image.tif")
        data_path = osp.join(self.data_dir, f"{self.data_id}_raw_feature_bc_matrix.h5")
        spatial_path = osp.join(self.data_dir, "tissue_positions_list.txt")
        meta_path = osp.join(self.data_dir, "cluster_labels.csv")

        logger.info(f"Loading expression data from {data_path}")
        # Peek at the top-level keys to pick the reader; close the handle before the reader reopens the file.
        with h5py.File(data_path, "r") as f:
            is_10x_h5 = "matrix" in list(f.keys())
        adata = sc.read_10x_h5(data_path) if is_10x_h5 else sc.read_h5ad(data_path)

        logger.info(f"Loading spatial info from {spatial_path}")
        spatial_full = pd.read_csv(spatial_path, header=None, index_col=0)
        spatial_full.index = spatial_full.index.astype(str)
        # Align the position table with the observation order of the expression data
        spatial = spatial_full.loc[adata.obs_names]

        logger.info(f"Loading label info from {meta_path}")
        meta_df = pd.read_csv(meta_path)

        # Restrict to captured spots
        indicator = spatial[1].values == 1
        adata = adata[indicator]
        spatial = spatial.iloc[indicator]

        # Prepare spatial info tables
        xy = spatial[[2, 3]].rename(columns={2: "x", 3: "y"})
        xy_pixel = spatial[[4, 5]].rename(columns={4: "x_pixel", 5: "y_pixel"}).astype(int)

        # Prepare meta data and create a column with indexed label info (index = order of first appearance)
        label_classes = {j: i for i, j in enumerate(meta_df["ground_truth"].unique())}
        meta_df["label"] = list(map(label_classes.get, meta_df["ground_truth"]))

        logger.info(f"Loading image data from {image_path}")
        img = cv2.imread(image_path)
        if img is None:
            # The None check must come first: comparing None with 0 yields a plain bool without ``.all()``.
            logger.info("image doesn't exist,use louvain")
        else:
            if (img == 0).all():
                # OpenCV produced an all-zero image; re-read the first level of the first series with tifffile.
                with tifffile.TiffFile(image_path) as tif:
                    img = tif.series[0].levels[0].asarray()
                img = img.squeeze()
            if img.ndim == 2:
                # Replicate a single-channel image into three identical channels
                img = np.stack([img, img, img], axis=2)

        return img, adata, xy, xy_pixel, meta_df

    def _raw_to_dance(self, raw_data):
        """Assemble the raw pieces returned by :meth:`_load_raw_data` into a ``Data`` object.

        Labels go to ``obs``, positions to ``obsm["spatial"]`` and ``obsm["spatial_pixel"]``, and the image to
        ``uns["image"]``. If ``self.sample_file`` is set, the data are subset to the row positions listed in it.

        """
        img, adata, xy, xy_pixel, meta_df = raw_data
        adata.var_names_make_unique()
        adata.obs = meta_df.set_index(adata.obs_names)
        adata.obsm["spatial"] = xy.set_index(adata.obs_names)
        adata.obsm["spatial_pixel"] = xy_pixel.set_index(adata.obs_names)
        adata.uns["image"] = img

        if self.sample_file is not None:
            sample_file = osp.join(self.data_dir, self.sample_file)
            with open(sample_file) as file:
                # Skip blank lines (e.g. trailing newlines) instead of failing on int("")
                sample_index = [int(line.strip()) for line in file if line.strip()]
            adata = adata[sample_index]

        data = Data(adata, train_size="all")
        return data
@register_dataset("spatial")
class CellTypeDeconvoDataset(BaseDataset):
    """Dataset pairing a reference count table with a mixture count table and its cell-type proportions.

    Parameters
    ----------
    data_dir
        Parent directory; the files of a dataset are stored in ``<data_dir>/<data_id>``. It is also passed to
        ``BaseDataset.__init__`` as its first argument.
    data_id
        Key of :attr:`URL_DICT` selecting the dataset to download and load.
    subset_common_celltypes
        If set to True, then subset both the reference and the real data to contain only cell types that are
        present in both reference and real.

    Raises
    ------
    ValueError
        If ``data_id`` is not a key of :attr:`URL_DICT`.

    """

    _DISPLAY_ATTRS = ("data_id", "subset_common_celltypes")
    _IGNORE_FILES = ["readme.txt"]
    # File extensions that _load_raw_data knows how to read
    _SUPPORTED_EXTS = (".csv", ".h5ad")
    # Table names (file name without extension) that _load_raw_data accesses unconditionally
    _REQUIRED_TABLES = ("ref_sc_count", "ref_sc_annot", "mix_count", "true_p")
    URL_DICT = {
        "CARD_synthetic": "https://www.dropbox.com/sh/v0vpv0jsnfexj7f/AADpizLGOrF7M8EesDihgbBla?dl=1",
        "GSE174746": "https://www.dropbox.com/sh/spfv06yfttetrab/AAAgORS6ocyoZEyxiRYKTymCa?dl=1",
        "SPOTLight_synthetic": "https://www.dropbox.com/sh/p1tfb0xe1yl2zpe/AAB6cF-BsdJcHToet_C-AlXAa?dl=1",
        "human PDAC": "https://www.dropbox.com/sh/9py6hk9j1ygyprh/AAAOKTo-TE_eX4JJg0HIFfZ7a?dl=1",
        "mouse brain 1": "https://www.dropbox.com/sh/e2nl247v1jrd7h8/AAC1IUlk_3vXUvfk2fv9L2D3a?dl=1",
        "lung6": "https://www.dropbox.com/scl/fo/ak4y1nu6ozi93cspmmkm1/h?rlkey=j5m62jxprcozskov3q0pzjdzv&dl=1",
        "lung9_1": "https://www.dropbox.com/scl/fo/9ektj6rhlnza4ox7a73e2/h?rlkey=kichva2ag2kc2s64vx5wcyt4e&dl=1",
        "lung9_2": "https://www.dropbox.com/scl/fo/lfhu461ls43ehj3y6uxj1/h?rlkey=7ft9y03rj4klwvdcfm6s6kzvt&dl=1",
        "hcc_liver": "https://www.dropbox.com/scl/fo/txi4jumgero25n929zssp/h?rlkey=23bzxg1a7f0w1orjkifz8zfo5&dl=1",
        "normal_liver": "https://www.dropbox.com/scl/fo/23qcxc0ibc6ki3hc3dixo/h?rlkey=0os9js49ut01g1fntn0i5m7zk&dl=1",
        "kidney_1139": "https://www.dropbox.com/scl/fo/ek7d6sdpugr3wqjf30k3l/h?rlkey=ey0vwkf1v9u5h6ea1hx5ldvsz&dl=1",
        "kidney_10838": "https://www.dropbox.com/scl/fo/p7vs52wdmqxfl8aah7vx1/h?rlkey=5jqo4p6s8nsqjgjbhzs91sukb&dl=1",
        "kidney_3323": "https://www.dropbox.com/scl/fo/zmjrg8h8bradykumx60q8/h?rlkey=8amhfq4vljw588l4sr355buk9&dl=1",
        "kidney_642": "https://www.dropbox.com/scl/fo/6g8eaftskncv8vj2d1zp9/h?rlkey=td5vqltxiyn3mrvlt769n3hyb&dl=1",
        "kidney_8693": "https://www.dropbox.com/scl/fo/rzujsz9hza1uak1fi3e64/h?rlkey=6bnkqj0isva36wxfol49kxrtl&dl=1",
        "kidney_2566": "https://www.dropbox.com/scl/fo/f3ms854h7j1j6z78mep92/h?rlkey=id7l6uny3m4ut0bgqc6ozgdkx&dl=1",
        "kidney_213": "https://www.dropbox.com/scl/fo/1ntibep89ph0wmn5ml6o6/h?rlkey=of0dci4fz50xkur2xkpbdxm7l&dl=1",
        "kidney_4061": "https://www.dropbox.com/scl/fo/abzhq1dr83n1azhk5v62x/h?rlkey=jrmy26d631wd9y3rmae4stlvi&dl=1",
        "kidney_1098": "https://www.dropbox.com/scl/fo/lslmdwodhfinnj1v81x5t/h?rlkey=eczhxo4r8flnrg02ub26oxjon&dl=1",
        "kidney_8471": "https://www.dropbox.com/scl/fo/eoxwm0h5oexv60anhobq9/h?rlkey=wpajrub5kqpz7n0enz6l8zw7i&dl=1"
    }
    AVAILABLE_DATA = sorted(URL_DICT)

    def __init__(self, data_dir="data/spatial", data_id="GSE174746", subset_common_celltypes: bool = True):
        super().__init__(data_dir)

        if data_id not in self.URL_DICT:
            raise ValueError(f"Unknown data_id {data_id!r}, available datasets are: {self.AVAILABLE_DATA}")

        self.data_id = data_id
        self.data_url = self.URL_DICT[data_id]
        self.data_dir = osp.join(data_dir, data_id)
        self.subset_common_celltypes = subset_common_celltypes

    def download(self):
        """Download the archive at ``self.data_url`` and extract it into ``self.data_dir``."""
        download_unzip(self.data_url, self.data_dir)

    def is_complete(self):
        """Return True if ``self.data_dir`` contains every table that :meth:`_load_raw_data` requires.

        A table counts as present when a file named ``<table>.csv`` or ``<table>.h5ad`` exists in the directory. An
        existing but empty or partially populated directory is therefore reported as incomplete.

        """
        if not osp.isdir(self.data_dir):
            return False

        available = set()
        for f in os.listdir(self.data_dir):
            filename, ext = osp.splitext(f)
            if ext in self._SUPPORTED_EXTS:
                available.add(filename)

        missing = [name for name in self._REQUIRED_TABLES if name not in available]
        if missing:
            logger.info(f"lack {missing} in {self.data_dir}")
            return False
        return True

    def _load_raw_data(self) -> Tuple[pd.DataFrame, ...]:
        """Read all csv / h5ad tables in ``self.data_dir`` and return the ones used for deconvolution.

        Returns
        -------
        tuple of pandas.DataFrame
            ``(ref_count, ref_annot, count_matrix, cell_type_portion, spatial)``. ``spatial`` is an all-zero table
            with columns ``x`` and ``y`` when no ``spatial_location`` table is found.

        Raises
        ------
        KeyError
            If one of the required tables (``ref_sc_count``, ``ref_sc_annot``, ``mix_count``, ``true_p``) is missing.

        """
        raw_data_dict = {}
        # Sorted so that the load order does not depend on the file system
        for f in sorted(os.listdir(self.data_dir)):
            if f in self._IGNORE_FILES:
                continue

            filepath = osp.join(self.data_dir, f)
            filename, ext = osp.splitext(f)
            if ext == ".csv":
                table = pd.read_csv(filepath, header=0, index_col=0)
            elif ext == ".h5ad":
                table = sc.read_h5ad(filepath).to_df()
            else:
                warnings.warn(f"Unsupported file type {ext!r}. Only csv or h5ad are supported now.")
                continue

            # Use string row labels for every table so that they can be matched against each other
            table.index = table.index.astype(str)
            raw_data_dict[filename] = table

        missing = [name for name in self._REQUIRED_TABLES if name not in raw_data_dict]
        if missing:
            raise KeyError(f"Missing required table(s) {missing} in {self.data_dir!r}")

        ref_count = raw_data_dict["ref_sc_count"]
        ref_annot = raw_data_dict["ref_sc_annot"]
        count_matrix = raw_data_dict["mix_count"]
        cell_type_portion = raw_data_dict["true_p"]
        if (spatial := raw_data_dict.get("spatial_location")) is None:
            spatial = pd.DataFrame(0, index=count_matrix.index, columns=["x", "y"])

        # Obtain cell type info and subset to common cell types between ref and real if needed
        ref_celltypes = set(ref_annot["cellType"].unique().tolist())
        real_celltypes = set(cell_type_portion.columns.tolist())
        logger.info(f"Number of cell types: reference = {len(ref_celltypes)}, real = {len(real_celltypes)}")
        if self.subset_common_celltypes:
            common_celltypes = sorted(ref_celltypes & real_celltypes)
            logger.info(f"Subsetting to common cell types (n={len(common_celltypes)}):\n{pformat(common_celltypes)}")

            idx = ref_annot[ref_annot["cellType"].isin(common_celltypes)].index
            ref_annot = ref_annot.loc[idx]
            ref_count = ref_count.loc[idx]

            cell_type_portion = cell_type_portion[common_celltypes]

        return ref_count, ref_annot, count_matrix, cell_type_portion, spatial

    def _raw_to_dance(self, raw_data: Tuple[pd.DataFrame, ...]):
        """Build a ``Data`` object holding the mixture data (split ``test``) and the reference data (split ``ref``).

        Cell-type proportions and positions are stored in ``obsm["cell_type_portion"]`` and ``obsm["spatial"]`` of the
        mixture data; the reference annotation becomes ``obs`` of the reference data.

        """
        ref_count, ref_annot, count_matrix, cell_type_portion, spatial = raw_data

        # Cast the matrices up front instead of relying on the ``dtype`` argument of the AnnData constructor
        adata_inf = AnnData(
            count_matrix.values.astype(np.float32),
            obs=pd.DataFrame(index=count_matrix.index.tolist()),
            var=pd.DataFrame(index=count_matrix.columns.tolist()),
        )
        adata_inf.obsm["cell_type_portion"] = cell_type_portion.astype(np.float32)
        adata_inf.obsm["spatial"] = spatial.astype(np.float32)

        adata_ref = AnnData(
            ref_count.values.astype(np.float32),
            obs=ref_annot,
            var=pd.DataFrame(index=ref_count.columns.tolist()),
        )

        # FIX: If we switch the order of the append below, i.e., append inf to ref, we get the following error
        # ValueError: Length mismatch: Expected axis has 520 elements, new values have 10454 elements
        # This is possibly a BUG in the anndata package.
        data = Data(adata_inf, full_split_name="test")
        data.append(Data(adata_ref, full_split_name="ref"), join="outer", label_batch=True)

        return data