Skip to content

Instantly share code, notes, and snippets.

@patcon
Last active November 23, 2025 02:59
Show Gist options
  • Select an option

  • Save patcon/7d8e0c153699d46073ec105a954f7e51 to your computer and use it in GitHub Desktop.

Select an option

Save patcon/7d8e0c153699d46073ec105a954f7e51 to your computer and use it in GitHub Desktop.
My dream Kedro config for generating pipelines
# --- Per-run settings ---
# Taken from the runtime parameters; the second resolver argument (null) is
# the fallback when no polis_url is passed.
polis_url: '${runtime_params:polis_url, null}'

# --- Settings shared by the whole run ---
n_components: 2
min_votes_threshold: 7

# --- Plot settings ---
flip_x: false
flip_y: false
# Whether to also render PNG images in addition to the JSON plot data used
# for previews. Off by default because image generation takes significant time.
generate_plot_images: false
# Named building blocks (preprocessors, imputers, reducers, clusterers).
# If the name of a definition looks like a template (e.g., "foo_{some_param}_bar"),
# then that named parameter will be expanded into a few definitions.
# (e.g., `some_param: [1, 2]` will expand to two definitions "foo_1_bar" and "foo_2_bar")
definitions:
  preprocessors:
    # Expands to 025pct, 050pct, 075pct and 100pct.
    # The values are quoted on purpose: a bare 025 is read as a number
    # (octal 21 by YAML 1.1 loaders, decimal 25 by YAML 1.2 loaders), which
    # would lose the zero padding that the expanded names rely on.
    # NOTE(review): the consumer presumably has to cast these to numbers - confirm.
    - name: "{percent_vote_cutoff}pct"
      percent_vote_cutoff: ["025", "050", "075", "100"]

  imputers:
    - name: mean
      estimator: SimpleImputer
      params:
        strategy: mean
    - name: zero
      estimator: SimpleImputer
      params:
        strategy: constant
        fill_value: 0
    - name: median
      estimator: SimpleImputer
      params:
        strategy: median
    - name: mode
      estimator: SimpleImputer
      params:
        strategy: most_frequent
    # Expands to knn5d and knn10d (distance-weighted neighbours).
    - name: "knn{n_neighbors}d"
      estimator: KNNImputer
      params:
        n_neighbors: [5, 10]
        weights: distance
    # Expands to knn5u and knn10u (uniformly weighted neighbours).
    - name: "knn{n_neighbors}u"
      estimator: KNNImputer
      params:
        n_neighbors: [5, 10]
        weights: uniform
    # Takes no params.
    - name: noop
      estimator: NoOpTransformer

  reducers:
    - name: pca
      estimator: PCA
      params:
        n_components: ${params:n_components}
        random_state: ${globals:random_state}
      # NOTE(review): the original indentation was lost; `scaler` is assumed to
      # be a sibling of `params`, with `X_sparse` belonging to the scaler - confirm.
      scaler:
        estimator: SparsityAwareScaler
        # NOTE(review): "input:..." looks like a reference to a pipeline input
        # rather than a literal string - confirm.
        X_sparse: "input:masked_vote_matrix"
    # Same PCA settings as `pca`, but without the `scaler` entry.
    - name: pca_no_scale
      estimator: PCA
      params:
        n_components: ${params:n_components}
        random_state: ${globals:random_state}
    - name: pacmap
      estimator: PaCMAP
      params:
        n_components: ${params:n_components}
        n_neighbors: null
        random_state: ${globals:random_state}
    - name: pacmap_masked
      estimator: PaCMAPWithMaskedDistance
      params:
        n_components: ${params:n_components}
        n_neighbors: null
        random_state: ${globals:random_state}
    # Hyperparameters are nested under `params` like every other reducer.
    - name: localmap
      estimator: LocalMAP
      params:
        n_components: ${params:n_components}
        n_neighbors: null
        random_state: ${globals:random_state}
    - name: umap
      estimator: UMAP
      params:
        n_components: ${params:n_components}
        n_neighbors: 15
        random_state: ${globals:random_state}

  clusterers:
    - name: bestkmeans
      estimator: BestKMeans
      params:
        k_bounds: [2, 5]
        random_state: ${globals:random_state}
    - name: besthdbscan
      estimator: BestHDBSCANFlat
      params:
        # Uncommenting gets around occasional error in HDBSCAN_flat
        # cluster_selection_method: leaf
        k_bounds: [2, 10]
        random_state: ${globals:random_state}
    - name: hdbscan_eom
      estimator: HDBSCAN
      params:
        cluster_selection_method: eom
    - name: hdbscan_leaf
      estimator: HDBSCAN
      params:
        cluster_selection_method: leaf
# Pipelines to generate. Each variant picks one preprocessor, imputer, reducer
# and clusterer by the `name` of a definition. A key holding a list is matched
# against the same-named placeholder in `name` (e.g. `{imputer}`), so the keys
# are singular everywhere to line up with the placeholders.
pipeline_variants:
  # Imputer x reducer x clusterer matrix on the full (100pct) data.
  # NOTE(review): presumably expands to the cross product of the three lists
  # (7 x 3 x 4 = 84 pipelines) - confirm against the expansion logic.
  main_matrix:
    active: true
    name: "{imputer}_{reducer}_{clusterer}"
    preprocessor: "100pct"
    imputer: [mean, zero, median, knn5d, knn10d, knn5u, knn10u]
    reducer: [pca, pacmap, localmap]
    clusterer: [bestkmeans, besthdbscan, hdbscan_eom, hdbscan_leaf]

  # One fixed imputer/reducer/clusterer combination, varied over the
  # vote-cutoff preprocessors.
  timescale:
    active: true
    name: "knn5d_pacmap_bestkmeans_{preprocessor}"
    preprocessor: ["025pct", "050pct", "075pct"]
    imputer: knn5d
    reducer: pacmap
    clusterer: bestkmeans

  zero_pacmap_masked_bestkmeans:
    active: true
    name: zero_pacmap_masked_bestkmeans
    preprocessor: "100pct"
    # imputation happens within the custom PaCMAP estimator
    imputer: noop
    reducer: pacmap_masked
    clusterer: bestkmeans

  # Uses the PCA reducer that has no scaler entry.
  mean_pca_bestkmeans_not_sparsity_aware:
    active: true
    name: mean_pca_bestkmeans_not_sparsity_aware
    preprocessor: "100pct"
    imputer: mean
    reducer: pca_no_scale
    clusterer: bestkmeans
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment