buckaroo-data · paddymul · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
@@ -34,6 +34,41 @@ jobs:
       - name: Run paddy-format check
         run: find buckaroo tests scripts -type f -name "*.py" -not -path "buckaroo/jlisp/lispy.py" -print0 | xargs -0 uv run python scripts/paddy_format.py --check
 
+  # Non-blocking static type check (continue-on-error) over the files touched
+  # by the dataflow generic-typing work. Scoped + standard mode are both
+  # deliberate: pyrightconfig.typecheck.json lists exactly those files and runs
+  # basedpyright's "standard" mode rather than its stricter "recommended"
+  # default, so the signal stays on real type errors instead of third-party
+  # stub noise. Editors keep using the [tool.basedpyright] block in
+  # pyproject.toml; pyrightconfig.typecheck.json is not an auto-discovered name.
+  TypeCheck:
+    name: Python / Typecheck (non-blocking)
+    runs-on: depot-ubuntu-latest
+    timeout-minutes: 5
+    continue-on-error: true
+    steps:
+      - uses: actions/checkout@v6
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+      - name: Install the project
+        # --all-extras so basedpyright can resolve polars/xorq/pyarrow imports.
+        run: uv sync --locked --all-extras --dev
+      - name: Run basedpyright (scoped, informational)
+        # Informational only: these files carry a large pre-existing backlog of
+        # standard-mode diagnostics, so the step is forced green and the totals
+        # go to the job summary. The full report is in the step log. `|| true`
+        # keeps the check green; job-level continue-on-error covers setup steps.
+        run: |
+          uv run --with basedpyright==1.39.8 basedpyright --project pyrightconfig.typecheck.json 2>&1 | tee bpr.log || true
+          {
+            echo "## basedpyright (standard mode, dataflow typing files — non-blocking)"
+            echo '```'
+            tail -n 1 bpr.log
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"
+
   TestJS:
     name: JS / Build + Test
     runs-on: depot-ubuntu-latest

diff --git a/buckaroo/buckaroo_widget.py b/buckaroo/buckaroo_widget.py
@@ -11,14 +11,14 @@
 import sys
 import traceback
 from datetime import datetime
-from typing import Literal, Union
+from typing import Any as TAny, ClassVar, Literal, Type, Union
 import pandas as pd
 import json
 import logging
 import random
 import string

 from traitlets import List, Dict, observe, Unicode, Any, Bool
 import anywidget

 from .customizations.pd_stats_v2 import PD_ANALYSIS_V2
@@ -35,6 +35,10 @@
 from .dataflow.autocleaning import PandasAutocleaning
 from pathlib import Path
 
+# pandas backend binding of the generic dataflow. Mirrors PolarsDataflow
+# (polars_buckaroo) and XorqDataflow (xorq_buckaroo).
+PandasDataflow = CustomizableDataflow[pd.DataFrame]
+
 logger = logging.getLogger()
 
 # Opt-in cross-boundary tracing. Set BUCKAROO_BK_FLASH=1 in the kernel
@@ -54,7 +58,7 @@

 class PdSampling(Sampling):
    @classmethod
    def pre_stats_sample(kls, df):
        # this is a bad place for fixing the dataframe, but for now
        # it's expedient. There probably should be a nother processing
        # step
@@ -111,7 +115,7 @@
              'secondary_df_viewer_config': EMPTY_DFVIEWER_CONFIG}}
        args_dict['args']['summary_stats_data'] = []
        if include_summary_stats:
            1/0 # not supported yet
            # summary_stats data is big, and most of the time you won't want to serialize it
            #args_dict['summary_stats_data'] = {} #desrialize here

@@ -148,7 +152,7 @@
            autoclean_conf= kls.autoclean_conf
            analysis_klasses = kls.analysis_klasses

            def _df_to_obj(idfself, df:pd.DataFrame):
                return self._df_to_obj(df)

        self.dataflow = InnerDataFlow(
@@ -184,11 +188,14 @@
     autocleaning_klass = PandasAutocleaning #override the base CustomizableDataFlow klass
     DFStatsClass = DfStatsV2 # Pandas Specific
     autoclean_conf = tuple([CleaningConf, NoCleaningConf]) #override the base CustomizableDataFlow conf
-    dataflow_klass = CustomizableDataflow
+    # The composed dataflow's backend binding. Typed broadly here (one slot,
+    # specialised per backend subclass); the precise binding comes from the
+    # alias/subclass assigned to it: PandasDataflow / PolarsDataflow / XorqDataflow.
+    dataflow_klass: ClassVar[Type[CustomizableDataflow[TAny]]] = PandasDataflow
 
 
     df_data_dict = Dict({}).tag(sync=True)
    df_display_args: DisplayArgs = Dict({}).tag(sync=True)
    #information about the dataframe
    df_meta = Dict({
        'columns': 5, # dummy data
@@ -247,7 +254,7 @@
        class DecoratedProcessing(ColAnalysis):
            provides_defaults = {}
            @classmethod
            def post_process_df(kls, df):
                new_df = df_processing_func(df)
                return [new_df, {}]
            post_processing_method = proc_func_name
@@ -280,7 +287,7 @@
        {'a':  5  , 'b':20, 'c': 'Paddy'},
        {'a': 58.2, 'b': 9, 'c': 'Margaret'}]).tag(sync=True)

    df_viewer_config: DFViewerConfig = Dict({
        'column_config': [],
        'pinned_rows': [],
        'first_col_configs':[]}).tag(sync=True)
@@ -378,7 +385,7 @@
        if self.dataflow.component_config:
            temp_display_args['main']['df_viewer_config']['component_config'] = self.dataflow.component_config

        self.df_display_args = temp_display_args
        _bk_flash("_handle_widget_change EXIT (df_display_args → JS)")



diff --git a/buckaroo/dataflow/abc_dataflow.py b/buckaroo/dataflow/abc_dataflow.py
@@ -3,41 +3,72 @@
 from __future__ import annotations
 
 from abc import ABCMeta, abstractmethod
-from typing import Any, Dict, List, Type
+from typing import Any, Generic, List, Optional, Type
 
 from traitlets import HasTraits, MetaHasTraits
 
 from buckaroo.pluggable_analysis_framework.col_analysis import ColAnalysis
 
+from .df_types import FrameT
+
 
 class _ABCMetaHasTraits(ABCMeta, MetaHasTraits):
     pass
 
 
-class ABCDataflow(HasTraits, metaclass=_ABCMetaHasTraits):
+class ABCDataflow(HasTraits, Generic[FrameT], metaclass=_ABCMetaHasTraits):
     """
     Abstract base for dataflow implementations.
     Reference implementations: DataFlow and CustomizableDataflow.
     Other implementations (e.g., ColumnExecutorDataflow) should conform.
+
+    Generic on ``FrameT`` — the (unbounded) carrier type, so this umbrella
+    covers both the eager subtree (``DataFlow`` / ``CustomizableDataflow``,
+    which narrow to the ``DataFrameLike``-bound ``DataFrameT``) and the lazy
+    ``ColumnExecutorDataflow`` (``ABCDataflow[pl.LazyFrame]``). See
+    ``df_types`` for the ``FrameT`` / ``DataFrameT`` split.
     """
 
     # Baseline interface expected by Buckaroo widgets and extensions.
+    #
+    # The synced/computed wire attributes below are declared ``Any`` on
+    # purpose. Every concrete dataflow backs them with a traitlets trait
+    # (``Dict(...)`` / ``Any(...)``) or a ``@property``, and a traitlets
+    # descriptor returns ``Any`` on instance access. Declaring a concrete
+    # ``Dict[str, Any]`` here only fights the descriptor protocol: the
+    # subclass trait/property reads as an incompatible override, and a write
+    # (``self.summary_sd = ...``) reads as an illegal assignment. What these
+    # document is "this name exists on every dataflow"; each trait's actual
+    # shape lives with its definition. Same rationale as the frame-carrying
+    # methods in ``df_types`` — type where it's sound, not on the traits.
     analysis_klasses: List[Type[ColAnalysis]]
-    df_data_dict: Dict[str, Any]
-    df_display_args: Dict[str, Any]
-    df_meta: Dict[str, Any]
-    buckaroo_options: Dict[str, Any]
-    command_config: Dict[str, Any]
+    df_data_dict: Any
+    df_display_args: Any
+    df_meta: Any
+    buckaroo_options: Any
+    command_config: Any
     operations: Any
-    operation_results: Dict[str, Any]
-    summary_sd: Dict[str, Any]
-    cleaned_sd: Dict[str, Any]
-    processed_sd: Dict[str, Any]
-    merged_sd: Dict[str, Any]
+    operation_results: Any
+    summary_sd: Any
+    cleaned_sd: Any
+    processed_sd: Any
+    merged_sd: Any
     widget_args_tuple: Any
-    processed_df: Any
-
-    analysis_klasses: List[Type[ColAnalysis]]
+
+    @property
+    @abstractmethod
+    def processed_df(self) -> Optional[FrameT]:
+        """Return the fully processed frame the widget renders.
+
+        The frame is in the backend's own type; ``None`` means no frame is
+        available (the pipeline has not run, or a lazy backend never
+        materializes one). Eager backends derive the value from
+        ``processed_result``; the lazy ``ColumnExecutorDataflow`` always
+        returns ``None``. Declared as an abstract property, not a plain
+        attribute, so the ``@property`` overrides in implementations are
+        compatible, like-for-like overrides.
+        """
+        ...
 
     @abstractmethod
     def populate_df_meta(self) -> None:

diff --git a/buckaroo/dataflow/column_executor_dataflow.py b/buckaroo/dataflow/column_executor_dataflow.py
@@ -27,9 +27,15 @@
     from buckaroo.file_cache.batch_planning import PlanningFunction
 
 
-class ColumnExecutorDataflow(ABCDataflow):
+class ColumnExecutorDataflow(ABCDataflow[pl.LazyFrame]):
     """A minimal DataFlow focused on column-executor-driven summary stats for Polars LazyFrames.
 
+    Binds the abstract base's unbounded ``FrameT`` to ``pl.LazyFrame``. A
+    LazyFrame is never materialized, so it cannot meet the eager
+    ``DataFrameLike`` contract (no ``len`` / row-slice) — which is why this
+    class inherits ``ABCDataflow`` directly rather than the eager
+    ``CustomizableDataflow`` body, and supplies its own executor pipeline.
+
     - Works with a LazyFrame and avoids materializing the dataframe on load.
 
     - No-op command config, autocleaning, quick commands, and
@@ -364,7 +370,9 @@ def auto_compute_summary(self, sync_executor_class: Type[Executor], parallel_exe
                     # Don't re-raise, let it fail silently and use defaults
 
     @property
-    def processed_df(self) -> Any:
+    def processed_df(self) -> Optional[pl.LazyFrame]:
+        # Lazy: the frame is never materialized, so there is no processed
+        # frame to render. Always None (matches ABCDataflow's Optional[FrameT]).
         return None
 
     @observe('merged_sd')