Search Space of Preprocessing Function Parameters

Once Stage 2 has identified the optimal preprocessing function, Stage 3 tunes the parameters of that function. The parameter search is guided by a configuration file, shown below, that defines the search space for each tunable parameter. You can edit this file to customize the search.

examples/tuning/step3_default_params.yaml
---
# Header: this file configures parameter tuning (`tune_mode: params`) for a
# preprocessor.
type: preprocessor
tune_mode: params
# Candidate preprocessing functions. Each entry has a `type` (pipeline step)
# and a `target` (function name); `params_to_tune`, where present, gives each
# parameter either `min`/`max` bounds or an explicit list of `values`.
pipeline:
# filter.gene
  # FilterGenesPercentile: two bounded numeric parameters plus one categorical.
  - type: filter.gene
    target: FilterGenesPercentile
    params_to_tune:
      # Lower cut-off, searched between 1 and 10
      # (presumably a percentile, going by the target name; confirm).
      min_val:
        min: 1
        max: 10
      # Upper cut-off, searched between 95 and 99.
      max_val:
        min: 95
        max: 99
      # Categorical choice among the four listed modes.
      mode:
        values: [sum, var, cv, rv]
  # FilterGenesScanpyOrder: tunes four thresholds and the `order` list.
  - type: filter.gene
    target: FilterGenesScanpyOrder
    params_to_tune:
      # All 24 permutations of the four threshold names
      # (presumably the order in which the thresholds are applied; confirm).
      order:
        values:
          - [min_counts, min_cells, max_counts, max_cells]
          - [min_counts, min_cells, max_cells, max_counts]
          - [min_counts, max_counts, min_cells, max_cells]
          - [min_counts, max_counts, max_cells, min_cells]
          - [min_counts, max_cells, min_cells, max_counts]
          - [min_counts, max_cells, max_counts, min_cells]
          - [min_cells, min_counts, max_counts, max_cells]
          - [min_cells, min_counts, max_cells, max_counts]
          - [min_cells, max_counts, min_counts, max_cells]
          - [min_cells, max_counts, max_cells, min_counts]
          - [min_cells, max_cells, min_counts, max_counts]
          - [min_cells, max_cells, max_counts, min_counts]
          - [max_counts, min_counts, min_cells, max_cells]
          - [max_counts, min_counts, max_cells, min_cells]
          - [max_counts, min_cells, min_counts, max_cells]
          - [max_counts, min_cells, max_cells, min_counts]
          - [max_counts, max_cells, min_counts, min_cells]
          - [max_counts, max_cells, min_cells, min_counts]
          - [max_cells, min_counts, min_cells, max_counts]
          - [max_cells, min_counts, max_counts, min_cells]
          - [max_cells, min_cells, min_counts, max_counts]
          - [max_cells, min_cells, max_counts, min_counts]
          - [max_cells, max_counts, min_counts, min_cells]
          - [max_cells, max_counts, min_cells, min_counts]
      # Disabled alternative ranges, labelled `cta_problem` by the author
      # (presumably the cell type annotation task; confirm). Kept for
      # reference; uncommenting them would duplicate the active keys below.
      # cta_problem
      # min_counts:
      #   min: 1
      #   max: 10
      # min_cells:
      #   min: 1
      #   max: 10
      # max_counts:
      #   min: 500
      #   max: 5000
      # max_cells:
      #   min: 500
      #   max: 5000
      # Active ranges.
      # NOTE(review): min_counts has integer bounds (3 to 500) while the other
      # three thresholds have fractional bounds within [0, 1]; confirm this
      # mix of scales is intended.
      min_counts:
        min: 3
        max: 500
      min_cells:
        min: 0.0
        max: 0.1
      max_counts:
        min: 0.9
        max: 1.0
      max_cells:
        min: 0.95
        max: 1.0
  # No `params_to_tune` declared for this target.
  - type: filter.gene
    target: FilterGenesPlaceHolder


# filter.cell
  # FilterCellsScanpyOrder: tunes four thresholds and the `order` list.
  - type: filter.cell
    target: FilterCellsScanpyOrder
    params_to_tune:
      # All 24 permutations of the four threshold names
      # (presumably the order in which the thresholds are applied; confirm).
      order:
        values:
          - [min_counts, min_genes, max_counts, max_genes]
          - [min_counts, min_genes, max_genes, max_counts]
          - [min_counts, max_counts, min_genes, max_genes]
          - [min_counts, max_counts, max_genes, min_genes]
          - [min_counts, max_genes, min_genes, max_counts]
          - [min_counts, max_genes, max_counts, min_genes]
          - [min_genes, min_counts, max_counts, max_genes]
          - [min_genes, min_counts, max_genes, max_counts]
          - [min_genes, max_counts, min_counts, max_genes]
          - [min_genes, max_counts, max_genes, min_counts]
          - [min_genes, max_genes, min_counts, max_counts]
          - [min_genes, max_genes, max_counts, min_counts]
          - [max_counts, min_counts, min_genes, max_genes]
          - [max_counts, min_counts, max_genes, min_genes]
          - [max_counts, min_genes, min_counts, max_genes]
          - [max_counts, min_genes, max_genes, min_counts]
          - [max_counts, max_genes, min_counts, min_genes]
          - [max_counts, max_genes, min_genes, min_counts]
          - [max_genes, min_counts, min_genes, max_counts]
          - [max_genes, min_counts, max_counts, min_genes]
          - [max_genes, min_genes, min_counts, max_counts]
          - [max_genes, min_genes, max_counts, min_counts]
          - [max_genes, max_counts, min_counts, min_genes]
          - [max_genes, max_counts, min_genes, min_counts]
      # All four thresholds are bounded by fractions: the lower thresholds in
      # [0.0, 0.05] and the upper thresholds in [0.95, 1.0].
      min_counts:
        min: 0.0  # author's note: this bound changes for joint embedding (TODO confirm the value used there)
        max: 0.05
      min_genes:
        min: 0.0
        max: 0.05
      max_counts:
        min: 0.95
        max: 1.0
      max_genes:
        min: 0.95
        max: 1.0
  # The two targets below declare no `params_to_tune`.
  - type: filter.cell
    target: FilterCellsPlaceHolder
  - type: filter.cell
    target: FilterCellsCommonMod


# normalize
  # ColumnSumNormalize: both parameters are categorical.
  - type: normalize
    target: ColumnSumNormalize
    params_to_tune:
      mode:
        values: [normalize, standardize, minmax, l2]
      # NOTE(review): -1 is listed alongside positive values; presumably a
      # sentinel (e.g. "disabled"). Confirm against the target.
      eps:
        values: [-1, 0.1, 0.3, 0.5, 0.7]
  # ScTransform: five bounded numeric parameters, one categorical (`n_cells`),
  # and one entry under `params`.
  - type: normalize
    target: ScTransform
    params_to_tune:
      min_cells:
        min: 1
        max: 10
      gmean_eps:
        min: 1
        max: 10
      n_genes:
        min: 1000
        max: 3000
      # `null` is one of the candidate values.
      n_cells:
        values: [null, 1, 10, 100]
      bin_size:
        min: 300
        max: 800
      bw_adjust:
        min: 1.0
        max: 5.0
    # Listed under `params`, not `params_to_tune`
    # (presumably passed through as a fixed argument; confirm).
    params:
      processes_num: 8
  # Log1P: a single bounded parameter, `base`, searched between 1.0 and 10.0.
  - type: normalize
    target: Log1P
    params_to_tune:
      # NOTE(review): a logarithm base of exactly 1.0 is degenerate
      # (log 1 = 0); confirm the target tolerates this lower bound.
      base:
        min: 1.0
        max: 10.0
  # NormalizeTotal: categorical search over `target_sum` and `max_fraction`.
  - type: normalize
    target: NormalizeTotal
    params_to_tune:
      target_sum:
        # Floats are written with a decimal point and a signed exponent so
        # that YAML 1.1 loaders (e.g. PyYAML) also resolve them as numbers;
        # such loaders read a bare `1e3` as the string "1e3". `null` is one
        # of the candidate values.
        values: [null, 1.0e+3, 1.0e+4, 1.0e+5, 1.0e+6]
      max_fraction:
        values: [0.01, 0.05, 0.5, 0.7, 1.0]
  # NormalizeTotalLog1P: one bounded parameter (`base`) and two categorical
  # parameters (`target_sum`, `max_fraction`).
  - type: normalize
    target: NormalizeTotalLog1P
    params_to_tune:
      # NOTE(review): a logarithm base of exactly 1.0 is degenerate
      # (log 1 = 0); confirm the target tolerates this lower bound.
      base:
        min: 1.0
        max: 10.0
      target_sum:
        # Floats are written with a decimal point and a signed exponent so
        # that YAML 1.1 loaders (e.g. PyYAML) also resolve them as numbers;
        # such loaders read a bare `1e3` as the string "1e3". `null` is one
        # of the candidate values.
        values: [null, 1.0e+3, 1.0e+4, 1.0e+5, 1.0e+6]
      max_fraction:
        values: [0.01, 0.05, 0.5, 0.7, 1.0]
  # The two targets below declare no `params_to_tune`.
  - type: normalize
    target: tfidfTransform
  - type: normalize
    target: NormalizePlaceHolder


# filter.gene (highly_variable)
  # FilterGenesTopK: gene count searched between 100 and 10000, plus a
  # boolean `top` flag and a categorical `mode`.
  - type: filter.gene
    target: FilterGenesTopK
    params_to_tune:
      num_genes:
        min: 100
        max: 10000
      top:
        values: [true, false]
      mode:
        values: [sum, var, cv, rv]
  # FilterGenesRegression: categorical `method` and the same gene-count range.
  - type: filter.gene
    target: FilterGenesRegression
    params_to_tune:
      method:
        values: [enclasc, seurat3, scmap]
      num_genes:
        min: 100
        max: 10000
  # FilterGenesMatch: nothing is tuned; `prefixes` is listed under `params`
  # (presumably gene-name prefixes passed as a fixed argument; confirm).
  - type: filter.gene
    target: FilterGenesMatch
    params:
      prefixes: [ERCC, MT-]
  # HighlyVariableGenesRawCount: two bounded numeric parameters.
  - type: filter.gene
    target: HighlyVariableGenesRawCount
    params_to_tune:
      n_top_genes:
        min: 100
        max: 10000
      span:
        min: 0.1
        max: 0.6
  # HighlyVariableGenesLogarithmizedByTopGenes: gene count, bin count and a
  # categorical `flavor`.
  - type: filter.gene
    target: HighlyVariableGenesLogarithmizedByTopGenes
    params_to_tune:
      n_top_genes:
        min: 100
        max: 10000
      n_bins:
        min: 10
        max: 30
      flavor:
        values: [seurat, cell_ranger]
  # HighlyVariableGenesLogarithmizedByMeanAndDisp: separate ranges for the
  # lower and upper dispersion and mean cut-offs, plus bin count and `flavor`.
  - type: filter.gene
    target: HighlyVariableGenesLogarithmizedByMeanAndDisp
    params_to_tune:
      min_disp:
        min: 0.05
        max: 0.5
      max_disp:
        min: 1.0
        max: 100.0
      min_mean:
        min: 0.0
        max: 0.0125
      max_mean:
        min: 3.0
        max: 20.0
      n_bins:
        min: 10
        max: 30
      flavor:
        values: [seurat, cell_ranger]
  # No `params_to_tune` declared for this target.
  - type: filter.gene
    target: FilterGenesNumberPlaceHolder

# feature.cell
  # Every entry in this group sets `out: feature.cell` under `params`
  # (presumably a fixed, non-tuned argument; confirm). Every tunable entry
  # searches `n_components` between 100 and 1000.
  - type: feature.cell
    target: CellPCA
    params:
      out: feature.cell
    params_to_tune:
      n_components:
        min: 100
        max: 1000
      svd_solver:
        values: [auto, full, arpack, randomized]
  - type: feature.cell
    target: CellSVD
    params:
      out: feature.cell
    params_to_tune:
      n_components:
        min: 100
        max: 1000
      algorithm:
        values: [arpack, randomized]
  - type: feature.cell
    target: CellSparsePCA
    params:
      out: feature.cell
    params_to_tune:
      n_components:
        min: 100
        max: 1000
  - type: feature.cell
    target: WeightedFeaturePCA
    params:
      out: feature.cell
    params_to_tune:
      n_components:
        min: 100
        max: 1000
      # `null` is one of the candidate values.
      feat_norm_mode:
        values: [null, normalize, standardize, minmax, l2]
  - type: feature.cell
    target: WeightedFeatureSVD
    params:
      out: feature.cell
    params_to_tune:
      n_components:
        min: 100
        max: 1000
      # `null` is one of the candidate values.
      feat_norm_mode:
        values: [null, normalize, standardize, minmax, l2]
  # This entry additionally sets `log_level: INFO` under `params`.
  - type: feature.cell
    target: GaussRandProjFeature
    params:
      out: feature.cell
      log_level: INFO
    params_to_tune:
      n_components:
        min: 100
        max: 1000
  # No `params_to_tune` declared for this target.
  - type: feature.cell
    target: FeatureCellPlaceHolder
    params:
      out: feature.cell
# Settings under the `wandb` key (presumably forwarded to a Weights & Biases
# sweep; confirm against the consumer).
wandb:
  # NOTE(review): `entity` and `project` are hard-coded to specific names;
  # users will likely need to replace them with their own.
  entity: xzy11632
  project: dance-dev
  method: bayes
  # Metric to optimize: `acc`, to be maximized.
  metric:
    name: acc  # alternative name noted by the author: val/acc
    goal: maximize