Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ trop_avg_ref/

# Academic papers (local only, not for distribution)
papers/
paper/

# Local analysis notebooks (not committed)
analysis/
Expand Down
53 changes: 51 additions & 2 deletions diff_diff/prep_dgp.py
Original file line number Diff line number Diff line change
Expand Up @@ -1189,6 +1189,7 @@ def generate_survey_did_data(
return_true_population_att: bool = False,
covariate_effects: Optional[tuple] = None,
te_covariate_interaction: float = 0.0,
conditional_pt: float = 0.0,
) -> pd.DataFrame:
"""
Generate synthetic staggered DiD data with survey structure.
Expand Down Expand Up @@ -1289,8 +1290,9 @@ def generate_survey_did_data(
with keys: ``population_att`` (weight-weighted average of treated
true effects), ``deff_kish`` (1 + CV(w)^2), ``base_stratum_effects``
(base stratum TEs before dynamic/covariate modifiers),
``icc_realized`` (ANOVA-based
ICC computed on period-1 data).
``icc_realized`` (ANOVA-based ICC computed on period-1 data),
and ``conditional_pt_active`` (bool, whether conditional PT
regime is active).
covariate_effects : tuple of (float, float), optional
Coefficients ``(beta1, beta2)`` for covariates x1 and x2 in the
outcome equation ``y += beta1 * x1 + beta2 * x2``. Default uses
Expand All @@ -1301,6 +1303,21 @@ def generate_survey_did_data(
``TE_i = base_TE + te_covariate_interaction * x1_i``. Creates
unit-level treatment effect heterogeneity driven by the continuous
covariate. Requires ``add_covariates=True``.
conditional_pt : float, default=0.0
Coefficient for X-dependent time trend:
``y += conditional_pt * x1_i * (t / n_periods)``. When nonzero,
treated units' x1 is drawn from N(1, 1) instead of N(0, 1),
creating differential pre-trends correlated with covariates.
Conditional on x1, trends remain parallel (conditional PT holds).
DR/IPW estimators with covariates recover truth; no-covariate
estimators are biased. Uses normalized time (t/n_periods) for
scale independence. Requires ``add_covariates=True`` and at least
one ever-treated and one never-treated unit (the x1 mean shift
only differentiates ever-treated from never-treated units).

.. note:: When used with ``icc``, the ICC calibration is approximate
because the x1 mean shift creates a mixture distribution with
slightly higher marginal variance than the assumed Var(x1) = 1.

Returns
-------
Expand Down Expand Up @@ -1414,6 +1431,25 @@ def generate_survey_did_data(
if te_covariate_interaction != 0.0 and not add_covariates:
raise ValueError("te_covariate_interaction requires add_covariates=True")

if not np.isfinite(conditional_pt):
raise ValueError(
f"conditional_pt must be finite, got {conditional_pt}"
)
if conditional_pt != 0.0 and not add_covariates:
raise ValueError("conditional_pt requires add_covariates=True")
if conditional_pt != 0.0:
n_never = int(n_units * never_treated_frac)
n_treated = n_units - n_never
if n_never < 1 or n_treated < 1:
raise ValueError(
"conditional_pt requires at least one ever-treated and one "
f"never-treated unit (n_units={n_units}, "
f"never_treated_frac={never_treated_frac} yields "
f"{n_never} never-treated, {n_treated} treated). "
"The x1 mean shift differentiates ever-treated from "
"never-treated units; both groups must be present."
)

# --- ICC -> psu_re_sd resolution ---
if icc is not None:
# Covariate variance: Var(beta1*x1) + Var(beta2*x2)
Expand Down Expand Up @@ -1492,8 +1528,12 @@ def generate_survey_did_data(
y0_period1 = _panel_unit_fe + psu_re[unit_psu] + psu_period_re[unit_psu, 0] + 0.5
if add_covariates:
_panel_x1 = rng.normal(0, 1, size=n_units)
if conditional_pt != 0.0:
_panel_x1[unit_cohort > 0] += 1.0
_panel_x2 = rng.choice([0, 1], size=n_units)
y0_period1 = y0_period1 + _beta1 * _panel_x1 + _beta2 * _panel_x2
if conditional_pt != 0.0:
y0_period1 = y0_period1 + conditional_pt * _panel_x1 * (1 / n_periods)
_rank_pair_weights(unit_weight, unit_stratum, y0_period1, n_strata)

# Save base weights for cross-section informative sampling (reset each period)
Expand Down Expand Up @@ -1531,11 +1571,15 @@ def generate_survey_did_data(
# Draw covariates early so they can be included in Y(0) ranking
if add_covariates:
x1 = rng.normal(0, 1, size=n_units)
if conditional_pt != 0.0:
x1[unit_cohort > 0] += 1.0
x2 = rng.choice([0, 1], size=n_units)
unit_weight = _base_weight.copy() # type: ignore[possibly-undefined]
y0_t = unit_fe + psu_re[unit_psu] + psu_period_re[unit_psu, t - 1] + 0.5 * t
if add_covariates:
y0_t = y0_t + _beta1 * x1 + _beta2 * x2
if conditional_pt != 0.0:
y0_t = y0_t + conditional_pt * x1 * (t / n_periods)
_rank_pair_weights(unit_weight, unit_stratum, y0_t, n_strata)

# Covariates — may already be drawn by informative sampling above
Expand All @@ -1546,6 +1590,8 @@ def generate_survey_did_data(
pass # x1, x2 already drawn in cross-section ranking block
elif add_covariates:
x1 = rng.normal(0, 1, size=n_units)
if conditional_pt != 0.0:
x1[unit_cohort > 0] += 1.0
x2 = rng.choice([0, 1], size=n_units)
else:
x1 = None
Expand All @@ -1564,6 +1610,8 @@ def generate_survey_did_data(

if add_covariates:
y += _beta1 * x1[i] + _beta2 * x2[i]
if conditional_pt != 0.0:
y += conditional_pt * x1[i] * (t / n_periods)

treated = int(g_i > 0 and t >= g_i)
true_eff = 0.0
Expand Down Expand Up @@ -1663,6 +1711,7 @@ def generate_survey_did_data(
"deff_kish": float(deff_kish),
"base_stratum_effects": stratum_effects,
"icc_realized": icc_realized,
"conditional_pt_active": conditional_pt != 0.0,
}

return df
Expand Down
15 changes: 15 additions & 0 deletions docs/methodology/REGISTRY.md
Original file line number Diff line number Diff line change
Expand Up @@ -2700,6 +2700,21 @@ The 8-step workflow in `docs/llms-practitioner.txt` is adapted from Baker et al.
contributions are included in the Y(0) ranking used for weight assignment.
Covariates are pre-drawn before the ranking step (panel: once before the loop;
cross-section: each period) and reused in the outcome generation.
- **Note:** When `conditional_pt != 0`, the DGP creates X-dependent time trends
that violate unconditional parallel trends while preserving conditional PT.
Two mechanisms activate: (1) ever-treated units' x1 is drawn from N(1, 1)
instead of N(0, 1), so the covariate distribution differs between ever-treated
and never-treated units; (2) the outcome includes
`conditional_pt * x1_i * (t / n_periods)` for all units. Because
E[x1 | ever-treated] != E[x1 | never-treated], the average time trend differs
by group (unconditional PT fails). Conditional on x1, trends are identical
(conditional PT holds). DR/IPW estimators with x1 as a covariate recover the
true ATT. Requires at least one ever-treated and one never-treated unit; a
`ValueError` is raised otherwise, because the x1 mean shift only differentiates
ever-treated from never-treated units.
- **Note:** When `conditional_pt != 0` is combined with `icc`, the ICC
calibration is approximate. The x1 mean shift creates a mixture distribution
with marginal Var(x1) = 1 + p_treated * (1 - p_treated) > 1, slightly
inflating non-PSU variance and causing realized ICC to undershoot the target.

---

Expand Down
16 changes: 8 additions & 8 deletions docs/survey-roadmap.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ Files: `benchmarks/R/benchmark_realdata_*.R`, `tests/test_survey_real_data.py`,
### Phase 10: Survey Completeness (v2.9.0–v3.0)

- **10a.** Survey theory document (`survey-theory.md`) — formal justification for design-based variance with modern DiD influence functions
- **10b.** Research-grade survey DGP — 8 new parameters on `generate_survey_did_data()`
- **10b.** Research-grade survey DGP — 9 parameters on `generate_survey_did_data()` (8 research-grade + `conditional_pt`)
- **10c.** R validation expansion — 8 of 16 estimators cross-validated against R's `survey::svyglm()`
- **10d.** Tutorial rewrite — flat-weight vs design-based comparison with known ground truth
- **10f.** WooldridgeDiD survey support — OLS, logit, Poisson paths with `pweight` + strata/PSU/FPC + TSL variance
Expand Down Expand Up @@ -164,10 +164,10 @@ Enhanced `generate_survey_did_data()` with 8 research-grade parameters:
`return_true_population_att`. All backward-compatible. Supports panel
and repeated cross-section modes.

**Remaining gap for 10e:** Conditional parallel trends — the DGP has
unconditional PT by construction. A `conditional_pt` parameter is needed
before the simulation study so that unconditional PT fails but conditional
PT holds after covariate adjustment (DR/IPW recovers truth).
**Resolved:** `conditional_pt` parameter added. When nonzero, it shifts
ever-treated units' x1 mean by +1 SD and adds
`conditional_pt * x1_i * (t / n_periods)` to the outcome, creating X-dependent
time trends. Unconditional PT fails; conditional PT holds after covariate
adjustment. DR/IPW estimators with covariates recover truth.

### 10c. Expand R Validation Coverage (HIGH priority) ✅

Expand Down Expand Up @@ -197,9 +197,9 @@ empirical illustration with NHANES ACA data (~3pp), software section
DR/IPW with covariates recovers truth; no-covariate estimator is biased.
This is the most novel claim — survey-weighted nuisance estimation
(propensity scores, outcome regression) produces valid IFs under complex
sampling. **Requires DGP extension**: add a `conditional_pt` parameter
to `generate_survey_did_data()` that makes the time trend
X-dependent (e.g., `trend_i = 0.5*t + delta * x1_i * t`).
sampling. **Resolved:** `conditional_pt` parameter added to
`generate_survey_did_data()`, which adds an X-dependent time trend
(`y += conditional_pt * x1_i * (t / n_periods)`) and shifts ever-treated
units' x1 mean by +1.

**Co-authorship:** A co-author from the DiD methodology community would
strengthen credibility — someone who can vouch that the IFs are valid
Expand Down
Loading
Loading