diff --git a/inst/CompactLinearModel.m b/inst/CompactLinearModel.m new file mode 100644 index 00000000..22ed45cb --- /dev/null +++ b/inst/CompactLinearModel.m @@ -0,0 +1,786 @@ +## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com> +## +## This file is part of the statistics package for GNU Octave. +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +classdef CompactLinearModel < handle +## -*- texinfo -*- +## @deftp {statistics} CompactLinearModel +## +## Compact linear regression model. +## +## A @code{CompactLinearModel} object is a compact representation of a linear +## regression model. It contains the regression results (coefficient +## estimates, goodness-of-fit statistics, and model information) but does not +## contain the training data, residuals, or observation-level diagnostics. +## +## A @code{CompactLinearModel} object is created by calling @code{compact} on +## a @code{LinearModel} object. Users cannot construct a +## @code{CompactLinearModel} directly. +## +## @strong{Properties} +## +## All properties are read-only. +## +## @multitable @columnfractions 0.30 0.70 +## @headitem Property @tab Description +## @item @code{Coefficients} @tab Table of coefficient estimates, SE, tStat, +## and pValue. +## @item @code{CoefficientCovariance} @tab Variance-covariance matrix of the +## coefficient estimates. +## @item @code{CoefficientNames} @tab Cell array of coefficient names. 
+## @item @code{DFE} @tab Error degrees of freedom. +## @item @code{Formula} @tab Struct describing the model formula. +## @item @code{LogLikelihood} @tab Log-likelihood of the fitted model. +## @item @code{ModelCriterion} @tab Struct with AIC, AICc, BIC, CAIC. +## @item @code{MSE} @tab Mean squared error. +## @item @code{NumCoefficients} @tab Total number of coefficients. +## @item @code{NumEstimatedCoefficients} @tab Number of estimated (non-rank- +## deficient) coefficients. +## @item @code{NumObservations} @tab Number of observations used in fitting. +## @item @code{NumPredictors} @tab Number of predictor variables. +## @item @code{NumVariables} @tab Total number of variables. +## @item @code{PredictorNames} @tab Cell array of predictor variable names. +## @item @code{ResponseName} @tab Name of the response variable. +## @item @code{RMSE} @tab Root mean squared error. +## @item @code{Robust} @tab Robust fitting information (empty for OLS). +## @item @code{Rsquared} @tab Struct with Ordinary and Adjusted R-squared. +## @item @code{SSE} @tab Sum of squared errors. +## @item @code{SSR} @tab Regression sum of squares. +## @item @code{SST} @tab Total sum of squares. +## @item @code{VariableInfo} @tab Table of variable metadata. +## @item @code{VariableNames} @tab Cell array of all variable names. +## @end multitable +## +## @strong{Methods} +## +## @multitable @columnfractions 0.30 0.70 +## @headitem Method @tab Description +## @item @code{predict} @tab Predict response for new data. +## @item @code{coefCI} @tab Confidence intervals for coefficients. +## @item @code{coefTest} @tab Linear hypothesis test on coefficients. 
+## @end multitable +## +## @seealso{fitlm} +## @end deftp + + properties (SetAccess = protected) + + ## Coefficient estimates + Coefficients = []; + CoefficientNames = {}; + CoefficientCovariance = []; + NumCoefficients = 0; + NumEstimatedCoefficients = 0; + + ## Summary statistics + DFE = 0; + SSE = 0; + SSR = 0; + SST = 0; + MSE = 0; + RMSE = 0; + Rsquared = []; + LogLikelihood = 0; + ModelCriterion = []; + + ## Fitting method + Robust = []; + + ## Input data info (survives compact()) + Formula = []; + NumObservations = 0; + NumPredictors = 0; + NumVariables = 0; + PredictorNames = {}; + ResponseName = ''; + VariableInfo = []; + VariableNames = {}; + + endproperties + + ## Constructor — Access = protected so only LinearModel (subclass) or + ## internal factory methods can create CompactLinearModel objects. + ## When Octave supports (Access = ?LinearModel), switch to that. + methods (Access = protected) + + function obj = CompactLinearModel (s) + ## CompactLinearModel Constructor (internal use only). + ## + ## obj = CompactLinearModel (s) + ## + ## S is a struct containing all fields needed to populate the object. + ## This constructor is called by LinearModel.compact() or by fitlm's + ## internal builder. Users should not call it directly. + + if (nargin < 1) + ## Allow zero-arg construction for Octave classdef internals + return; + endif + + if (! 
isstruct (s)) + error ("CompactLinearModel: constructor argument must be a struct."); + endif + + ## Coefficient results + obj.Coefficients = s.Coefficients; + obj.CoefficientNames = s.CoefficientNames; + obj.CoefficientCovariance = s.CoefficientCovariance; + obj.NumCoefficients = s.NumCoefficients; + obj.NumEstimatedCoefficients = s.NumEstimatedCoefficients; + + ## Summary statistics + obj.DFE = s.DFE; + obj.SSE = s.SSE; + obj.SSR = s.SSR; + obj.SST = s.SST; + obj.MSE = s.MSE; + obj.RMSE = s.RMSE; + obj.Rsquared = s.Rsquared; + obj.LogLikelihood = s.LogLikelihood; + obj.ModelCriterion = s.ModelCriterion; + + ## Fitting method + obj.Robust = s.Robust; + + ## Formula and variable info + obj.Formula = s.Formula; + obj.NumObservations = s.NumObservations; + obj.NumPredictors = s.NumPredictors; + obj.NumVariables = s.NumVariables; + obj.PredictorNames = s.PredictorNames; + obj.ResponseName = s.ResponseName; + obj.VariableInfo = s.VariableInfo; + obj.VariableNames = s.VariableNames; + + endfunction + + endmethods + + ## Hidden methods: display, disp, subsref + methods (Hidden) + + ## Custom display — called when variable name is typed at prompt + function display (obj) + in_name = inputname (1); + if (! isempty (in_name)) + fprintf ("%s =\n", in_name); + endif + disp (obj); + endfunction + + ## Custom disp — replicates MATLAB's LinearModel display format + function disp (obj) + + fprintf ("\n"); + + ## Show "Compact" prefix for CompactLinearModel, not for LinearModel + if (strcmp (class (obj), "CompactLinearModel")) + fprintf ("Compact linear regression model:\n"); + else + fprintf ("Linear regression model:\n"); + endif + + ## Formula line: ResponseName ~ LinearPredictor + lp_str = ""; + if (isa (obj.Formula, "LinearFormula")) + lp_str = obj.Formula.LinearPredictor; + elseif (isstruct (obj.Formula) && isfield (obj.Formula, "LinearPredictor")) + lp_str = obj.Formula.LinearPredictor; + endif + if (! isempty (lp_str)) + if (! 
isempty (obj.ResponseName)) + fprintf (" %s ~ %s\n", obj.ResponseName, lp_str); + else + fprintf (" %s\n", lp_str); + endif + endif + + fprintf ("\n"); + fprintf ("Estimated Coefficients:\n"); + + ## --- Build coefficient table display --- + coef_names = obj.CoefficientNames; + p = numel (coef_names); + + ## Extract numeric vectors from Coefficients + if (isstruct (obj.Coefficients)) + est = obj.Coefficients.Estimate; + se = obj.Coefficients.SE; + ts = obj.Coefficients.tStat; + pv = obj.Coefficients.pValue; + elseif (isa (obj.Coefficients, "table")) + est = obj.Coefficients.Estimate; + se = obj.Coefficients.SE; + ts = obj.Coefficients.tStat; + pv = obj.Coefficients.pValue; + else + est = []; se = []; ts = []; pv = []; + endif + + if (! isempty (est)) + ## Column headers + hdr_fmt = " %-20s %12s %12s %12s %12s\n"; + fprintf (hdr_fmt, "", "Estimate", "SE", "tStat", "pValue"); + + ## Separator line + sep_fmt = " %-20s %12s %12s %12s %12s\n"; + fprintf (sep_fmt, "", "________", "________", "________", "__________"); + fprintf ("\n"); + + ## Data rows + for i = 1:p + row_fmt = " %-20s %12.6g %12.6g %12.6g %12.4g\n"; + fprintf (row_fmt, coef_names{i}, est(i), se(i), ts(i), pv(i)); + endfor + endif + + fprintf ("\n"); + + ## Summary footer + fprintf ("Number of observations: %d, Error degrees of freedom: %d\n", ... + obj.NumObservations, obj.DFE); + fprintf ("Root Mean Squared Error: %.4g\n", obj.RMSE); + + if (isstruct (obj.Rsquared)) + fprintf ("R-squared: %.4g, Adjusted R-Squared: %.4g\n", ... + obj.Rsquared.Ordinary, obj.Rsquared.Adjusted); + endif + + ## F-statistic vs constant model + ## F = (SSR / (NumEstimatedCoefficients - 1)) / MSE + if (obj.NumEstimatedCoefficients > 1 && obj.MSE > 0) + df1 = obj.NumEstimatedCoefficients - 1; + F_val = (obj.SSR / df1) / obj.MSE; + F_pval = 1 - fcdf (F_val, df1, obj.DFE); + fprintf ("F-statistic vs. constant model: %.4g, p-value = %.4g\n", ... 
+ F_val, F_pval); + endif + + fprintf ("\n"); + + endfunction + + ## Custom subsref — supports chained dot indexing for properties + function varargout = subsref (obj, s) + chain_s = s(2:end); + s = s(1); + switch (s.type) + case "()" + error (strcat ("Invalid () indexing for referencing values", ... + " in a CompactLinearModel object.")); + case "{}" + error (strcat ("Invalid {} indexing for referencing values", ... + " in a CompactLinearModel object.")); + case "." + if (! ischar (s.subs)) + error (strcat ("CompactLinearModel.subsref: '.'", ... + " indexing argument must be a character vector.")); + endif + ## Check if it's a method call + switch (s.subs) + case "predict" + if (! isempty (chain_s) && strcmp (chain_s(1).type, "()")) + args = chain_s(1).subs; + chain_s = chain_s(2:end); + [varargout{1:nargout}] = predict (obj, args{:}); + else + error ("CompactLinearModel: predict requires arguments."); + endif + if (! isempty (chain_s)) + [varargout{1:nargout}] = subsref (varargout{1}, chain_s); + endif + return; + case "coefCI" + if (! isempty (chain_s) && strcmp (chain_s(1).type, "()")) + args = chain_s(1).subs; + chain_s = chain_s(2:end); + [varargout{1:nargout}] = coefCI (obj, args{:}); + else + [varargout{1:nargout}] = coefCI (obj); + endif + if (! isempty (chain_s)) + [varargout{1:nargout}] = subsref (varargout{1}, chain_s); + endif + return; + case "coefTest" + if (! isempty (chain_s) && strcmp (chain_s(1).type, "()")) + args = chain_s(1).subs; + chain_s = chain_s(2:end); + [varargout{1:nargout}] = coefTest (obj, args{:}); + else + [varargout{1:nargout}] = coefTest (obj); + endif + if (! isempty (chain_s)) + [varargout{1:nargout}] = subsref (varargout{1}, chain_s); + endif + return; + case "compact" + [varargout{1:nargout}] = compact (obj); + if (! isempty (chain_s)) + [varargout{1:nargout}] = subsref (varargout{1}, chain_s); + endif + return; + case "addTerms" + if (! 
isempty (chain_s) && strcmp (chain_s(1).type, "()")) + args = chain_s(1).subs; + chain_s = chain_s(2:end); + [varargout{1:nargout}] = addTerms (obj, args{:}); + else + error ("CompactLinearModel: addTerms requires arguments."); + endif + if (! isempty (chain_s)) + [varargout{1:nargout}] = subsref (varargout{1}, chain_s); + endif + return; + case "removeTerms" + if (! isempty (chain_s) && strcmp (chain_s(1).type, "()")) + args = chain_s(1).subs; + chain_s = chain_s(2:end); + [varargout{1:nargout}] = removeTerms (obj, args{:}); + else + error ("CompactLinearModel: removeTerms requires arguments."); + endif + if (! isempty (chain_s)) + [varargout{1:nargout}] = subsref (varargout{1}, chain_s); + endif + return; + case "step" + if (! isempty (chain_s) && strcmp (chain_s(1).type, "()")) + args = chain_s(1).subs; + chain_s = chain_s(2:end); + [varargout{1:nargout}] = step (obj, args{:}); + else + [varargout{1:nargout}] = step (obj); + endif + if (! isempty (chain_s)) + [varargout{1:nargout}] = subsref (varargout{1}, chain_s); + endif + return; + case "anova" + if (! isempty (chain_s) && strcmp (chain_s(1).type, "()")) + args = chain_s(1).subs; + chain_s = chain_s(2:end); + [varargout{1:nargout}] = anova (obj, args{:}); + else + [varargout{1:nargout}] = anova (obj); + endif + if (! isempty (chain_s)) + [varargout{1:nargout}] = subsref (varargout{1}, chain_s); + endif + return; + case "feval" + if (! isempty (chain_s) && strcmp (chain_s(1).type, "()")) + args = chain_s(1).subs; + chain_s = chain_s(2:end); + [varargout{1:nargout}] = predict (obj, args{:}); + else + error ("CompactLinearModel: feval requires arguments."); + endif + if (! isempty (chain_s)) + [varargout{1:nargout}] = subsref (varargout{1}, chain_s); + endif + return; + case "disp" + disp (obj); + return; + endswitch + ## Property access + try + out = obj.(s.subs); + catch + error ("CompactLinearModel.subsref: unrecognized property: '%s'.", ... 
+ s.subs); + end_try_catch + endswitch + ## Chained references + if (! isempty (chain_s)) + out = subsref (out, chain_s); + endif + varargout{1} = out; + endfunction + + ## Prevent direct property assignment by users + function obj = subsasgn (obj, s, val) + if (numel (s) > 1) + error (strcat ("CompactLinearModel.subsasgn:", ... + " chained subscripts not allowed.")); + endif + switch (s.type) + case "()" + error (strcat ("Invalid () indexing for assigning values", ... + " to a CompactLinearModel object.")); + case "{}" + error (strcat ("Invalid {} indexing for assigning values", ... + " to a CompactLinearModel object.")); + case "." + error (strcat ("CompactLinearModel.subsasgn:", ... + " all properties are read-only: '%s'."), s.subs); + endswitch + endfunction + + endmethods + + ## Public methods: predict, coefCI, coefTest + methods (Access = public) + + ## -*- texinfo -*- + ## @deftypefn {CompactLinearModel} {@var{ypred} =} predict (@var{obj}, @var{Xnew}) + ## @deftypefnx {CompactLinearModel} {[@var{ypred}, @var{yci}] =} predict (@var{obj}, @var{Xnew}) + ## @deftypefnx {CompactLinearModel} {[@var{ypred}, @var{yci}] =} predict (@dots{}, @var{Name}, @var{Value}) + ## + ## Predict response for new data using a compact linear regression model. + ## + ## @code{@var{ypred} = predict (@var{obj}, @var{Xnew})} returns the predicted + ## response values for the predictor data in @var{Xnew} using the fitted + ## model @var{obj}. @var{Xnew} must be a numeric matrix with the same + ## number of predictor columns as used to fit the model (excluding + ## intercept — it is added automatically if the model has one). + ## + ## @code{[@var{ypred}, @var{yci}] = predict (@dots{})} also returns + ## confidence or prediction intervals in the @math{n x 2} matrix @var{yci}. + ## + ## Additional options can be specified as @qcode{Name-Value} pairs: + ## + ## @table @code + ## @item Alpha + ## Significance level for the confidence/prediction intervals. 
+ ## Default is 0.05 (95% intervals). + ## + ## @item Prediction + ## Type of interval: @qcode{'curve'} for confidence intervals on the + ## mean response (default), or @qcode{'observation'} for prediction + ## intervals on individual new observations. + ## + ## @item Simultaneous + ## If @code{true}, compute simultaneous Scheffé confidence bands. + ## Default is @code{false} (pointwise intervals). + ## @end table + ## + ## @seealso{CompactLinearModel, coefCI, coefTest} + ## @end deftypefn + + function [ypred, yci] = predict (obj, Xnew, varargin) + + if (nargin < 2) + error ("CompactLinearModel.predict: Xnew is required."); + endif + + ## Parse name-value pairs + alpha = 0.05; + pred_obs = false; ## false = curve (CI on mean), true = observation (PI) + simult = false; + + i = 1; + while (i <= numel (varargin)) + if (! ischar (varargin{i})) + error ("CompactLinearModel.predict: expected parameter name string."); + endif + switch (lower (varargin{i})) + case "alpha" + alpha = varargin{i+1}; + i = i + 2; + case "prediction" + pred_obs = strcmpi (varargin{i+1}, "observation"); + i = i + 2; + case "simultaneous" + simult = varargin{i+1}; + i = i + 2; + otherwise + error ("CompactLinearModel.predict: unknown parameter '%s'.", ... 
+ varargin{i}); + endswitch + endwhile + + ## Determine intercept flag + has_int = false; + if (isa (obj.Formula, "LinearFormula")) + has_int = obj.Formula.HasIntercept; + elseif (isstruct (obj.Formula) && isfield (obj.Formula, "HasIntercept")) + has_int = obj.Formula.HasIntercept; + endif + + ## Unpack table input into numeric design matrix + if (isa (Xnew, "table")) + Xnew = __predict_table_to_Xmat__ (obj, Xnew, has_int); + ## Xnew is now the full design matrix (with intercept if applicable) + Xmat = Xnew; + else + ## Standard numeric matrix path + if (has_int) + Xmat = [ones(rows (Xnew), 1), Xnew]; + else + Xmat = Xnew; + endif + endif + + ## Check dimensions + if (columns (Xmat) != obj.NumCoefficients) + error (strcat ("CompactLinearModel.predict: Xnew has wrong", ... + " number of columns. Expected %d predictor columns."), ... + obj.NumCoefficients - double (obj.Formula.HasIntercept)); + endif + + ## Extract coefficient estimates + if (isa (obj.Coefficients, "table")) + beta = obj.Coefficients.Estimate; + elseif (isstruct (obj.Coefficients)) + beta = obj.Coefficients.Estimate; + else + error ("CompactLinearModel.predict: invalid Coefficients format."); + endif + + ## Point predictions + ypred = Xmat * beta; + + ## Compute intervals if requested + if (nargout > 1) + se_fit = sqrt (sum ((Xmat * obj.CoefficientCovariance) .* Xmat, 2)); + + if (pred_obs) + se_pred = sqrt (se_fit .^ 2 + obj.MSE); + else + se_pred = se_fit; + endif + + if (simult) + p_eff = obj.NumEstimatedCoefficients; + F_crit = finv (1 - alpha, p_eff, obj.DFE); + hw = sqrt (p_eff * F_crit) .* se_pred; + else + hw = tinv (1 - alpha / 2, obj.DFE) .* se_pred; + endif + + yci = [ypred - hw, ypred + hw]; + endif + + endfunction + + ## -*- texinfo -*- + ## @deftypefn {CompactLinearModel} {@var{ci} =} coefCI (@var{obj}) + ## @deftypefnx {CompactLinearModel} {@var{ci} =} coefCI (@var{obj}, @var{alpha}) + ## + ## Confidence intervals for coefficient estimates. 
+ ## + ## @code{@var{ci} = coefCI (@var{obj})} returns a @math{p x 2} matrix + ## of 95% confidence intervals for the coefficient estimates, where + ## @math{p} is the number of coefficients. + ## + ## @code{@var{ci} = coefCI (@var{obj}, @var{alpha})} specifies the + ## significance level. For example, @code{alpha = 0.01} gives 99% + ## confidence intervals. + ## + ## @seealso{CompactLinearModel, predict, coefTest} + ## @end deftypefn + + function ci = coefCI (obj, alpha) + + if (nargin < 2) + alpha = 0.05; + endif + + t_crit = tinv (1 - alpha / 2, obj.DFE); + + ## Extract estimates and SEs + if (isa (obj.Coefficients, "table")) + est = obj.Coefficients.Estimate; + se = obj.Coefficients.SE; + elseif (isstruct (obj.Coefficients)) + est = obj.Coefficients.Estimate; + se = obj.Coefficients.SE; + else + error ("CompactLinearModel.coefCI: invalid Coefficients format."); + endif + + ci = [est - t_crit * se, est + t_crit * se]; + + endfunction + + ## -*- texinfo -*- + ## @deftypefn {CompactLinearModel} {[@var{pval}, @var{F}, @var{r}] =} coefTest (@var{obj}) + ## @deftypefnx {CompactLinearModel} {[@var{pval}, @var{F}, @var{r}] =} coefTest (@var{obj}, @var{H}) + ## @deftypefnx {CompactLinearModel} {[@var{pval}, @var{F}, @var{r}] =} coefTest (@var{obj}, @var{H}, @var{c}) + ## + ## Linear hypothesis test on coefficients. + ## + ## @code{[@var{pval}, @var{F}, @var{r}] = coefTest (@var{obj})} performs + ## the overall F-test: tests whether all non-intercept coefficients are + ## simultaneously zero. The F-statistic matches the "F-statistic vs. + ## constant model" shown by @code{disp}. + ## + ## @code{[@var{pval}, @var{F}, @var{r}] = coefTest (@var{obj}, @var{H})} + ## tests the hypothesis @code{@var{H} * beta = 0}, where @var{H} is a + ## contrast matrix. + ## + ## @code{[@var{pval}, @var{F}, @var{r}] = coefTest (@var{obj}, @var{H}, + ## @var{c})} tests the hypothesis @code{@var{H} * beta = @var{c}}. 
+ ## + ## The output @var{r} is the numerator degrees of freedom for the + ## F-test (equal to the number of rows in @var{H}). The F-statistic + ## has @var{r} degrees of freedom in the numerator and @code{obj.DFE} + ## degrees of freedom in the denominator. + ## + ## @seealso{CompactLinearModel, predict, coefCI} + ## @end deftypefn + + function [pval, F, r] = coefTest (obj, H, c) + + ## Extract beta + if (isa (obj.Coefficients, "table")) + beta = obj.Coefficients.Estimate; + elseif (isstruct (obj.Coefficients)) + beta = obj.Coefficients.Estimate; + else + error ("CompactLinearModel.coefTest: invalid Coefficients format."); + endif + + p = obj.NumCoefficients; + + ## Default H: overall F-test (all non-intercept = 0) + if (nargin < 2 || isempty (H)) + ## H = [zeros(p-1, 1), eye(p-1)] — tests all columns except intercept + H = [zeros(p - 1, 1), eye(p - 1)]; + endif + + ## Default c: zero vector + if (nargin < 3 || isempty (c)) + c = zeros (rows (H), 1); + endif + + ## Contrast vector (used internally for F computation) + contrast = H * beta - c; + + ## Wald F-test + V = H * obj.CoefficientCovariance * H'; + df1 = rows (H); + F = (contrast' * (V \ contrast)) / df1; + pval = 1 - fcdf (F, df1, obj.DFE); + + ## r = numerator degrees of freedom (= rows of H) + r = df1; + + endfunction + + function [varargout] = feval (obj, varargin) + ## feval Evaluate model (alias for predict). + ## + ## ypred = feval (mdl, Xnew) + ## [ypred, yci] = feval (mdl, Xnew, ...) + ## + ## This method enables MATLAB-compatible feval(mdl, Xnew) syntax. + + [varargout{1:nargout}] = predict (obj, varargin{:}); + endfunction + + endmethods + + methods (Access = private) + + function Xmat = __predict_table_to_Xmat__ (obj, tbl_new, has_int) + ## __predict_table_to_Xmat__ Convert a table of predictors to design matrix. 
+ ## + ## Extracts columns matching PredictorNames, expands categoricals + ## into dummy columns using the same reference coding as the fitted + ## model, and prepends an intercept column if needed. + + pred_names = obj.PredictorNames; + p = numel (pred_names); + n_new = rows (tbl_new); + + ## Get the CoefficientNames to determine expected dummy structure + coef_names = obj.CoefficientNames; + + ## Determine which predictors are actually in the model + ## Use InModel from Formula if available + in_model = true (1, p); + if (isa (obj.Formula, "LinearFormula") && ! isempty (obj.Formula.InModel)) + im = obj.Formula.InModel; + in_model = im(1:min(p, numel(im))); + elseif (isstruct (obj.Formula) && isfield (obj.Formula, "InModel")) + im = obj.Formula.InModel; + in_model = im(1:min(p, numel(im))); + endif + + ## Build columns for each predictor that is IN the model + Xcols = []; + for k = 1:p + if (! in_model(k)) + continue; + endif + pname = pred_names{k}; + + ## Extract column from table + try + col_data = tbl_new.(pname); + catch + error ("CompactLinearModel.predict: table is missing predictor column '%s'.", ... + pname); + end_try_catch + + if (isa (col_data, "categorical")) + ## Categorical: expand into dummy columns using reference coding + levs = cellstr (categories (col_data)); + col_str = cellstr (col_data); + [~, cat_idx] = ismember (col_str, levs); + n_lev = numel (levs); + + ## Determine which levels the model expects (from CoefficientNames) + ## Look for patterns like PredName_LevelName in coef_names + dum_cols = []; + dum_found = false; + for L = 1:n_lev + expected_name = sprintf ("%s_%s", pname, levs{L}); + if (any (strcmp (coef_names, expected_name))) + dum_found = true; + dum_cols = [dum_cols, double(cat_idx == L)]; + endif + endfor + + if (! 
dum_found) + ## Fallback: reference coding (drop first level) + for L = 2:n_lev + dum_cols = [dum_cols, double(cat_idx == L)]; + endfor + endif + + Xcols = [Xcols, dum_cols]; + else + ## Numeric predictor + Xcols = [Xcols, double(col_data)]; + endif + endfor + + ## Prepend intercept + if (has_int) + Xmat = [ones(n_new, 1), Xcols]; + else + Xmat = Xcols; + endif + endfunction + + endmethods + +endclassdef + +## Test: CompactLinearModel cannot be constructed directly (protected access) +## This should error since the constructor is protected. +%!error CompactLinearModel (struct ()) + +## Test: verify class definition parses without error +## (This test just confirms the file is syntactically valid) +%!test +%! ## Check that the class exists and is loadable +%! assert (exist ("CompactLinearModel", "class") > 0 || ... +%! exist ("CompactLinearModel") > 0); diff --git a/inst/LinearFormula.m b/inst/LinearFormula.m new file mode 100644 index 00000000..c29e0ae6 --- /dev/null +++ b/inst/LinearFormula.m @@ -0,0 +1,122 @@ +## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com> +## +## This file is part of the statistics package for GNU Octave. +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +classdef LinearFormula +## -*- texinfo -*- +## @deftp {statistics} LinearFormula +## +## Linear formula encapsulation class. 
+## +## A @code{LinearFormula} object encapsulates the internal formula +## representation for a @code{LinearModel} or @code{CompactLinearModel}. +## When displayed in the console, it shows only the human-readable +## formula string (e.g., @code{y ~ 1 + x1 + x2}). +## +## @strong{Properties} +## +## @multitable @columnfractions 0.30 0.70 +## @headitem Property @tab Description +## @item @code{ResponseName} @tab Name of the response variable. +## @item @code{LinearPredictor} @tab Human-readable formula string. +## @item @code{Terms} @tab Binary terms matrix (n_terms x p). +## @item @code{HasIntercept} @tab Logical: true if model has intercept. +## @item @code{InModel} @tab Logical vector indicating which terms are +## active. +## @item @code{CoefficientNames} @tab Cell array of coefficient names. +## @item @code{PredictorNames} @tab Cell array of predictor names. +## @item @code{VariableNames} @tab Cell array of all variable names. +## @end multitable +## +## @seealso{fitlm, LinearModel, CompactLinearModel} +## @end deftp + + properties (SetAccess = protected) + ResponseName = ""; + LinearPredictor = ""; + Terms = []; + HasIntercept = true; + InModel = []; + CoefficientNames = {}; + PredictorNames = {}; + VariableNames = {}; + endproperties + + methods + + function obj = LinearFormula (s) + ## LinearFormula Constructor. + ## + ## obj = LinearFormula (s) + ## + ## S is a struct with formula fields. If S is already a + ## LinearFormula, it is returned as-is. + + if (nargin < 1) + return; + endif + + if (isa (s, "LinearFormula")) + obj = s; + return; + endif + + if (! 
isstruct (s)) + error ("LinearFormula: constructor argument must be a struct."); + endif + + if (isfield (s, "ResponseName")) + obj.ResponseName = s.ResponseName; + endif + if (isfield (s, "LinearPredictor")) + obj.LinearPredictor = s.LinearPredictor; + endif + if (isfield (s, "Terms")) + obj.Terms = s.Terms; + endif + if (isfield (s, "HasIntercept")) + obj.HasIntercept = s.HasIntercept; + endif + if (isfield (s, "InModel")) + obj.InModel = s.InModel; + endif + if (isfield (s, "CoefficientNames")) + obj.CoefficientNames = s.CoefficientNames; + endif + if (isfield (s, "PredictorNames")) + obj.PredictorNames = s.PredictorNames; + endif + if (isfield (s, "VariableNames")) + obj.VariableNames = s.VariableNames; + endif + endfunction + + function disp (obj) + ## disp Display only the formula string. + ## + ## Matches MATLAB's LinearFormula display behavior: shows only + ## the human-readable formula (e.g., "y ~ 1 + x1 + x2"). + fprintf (" %s ~ %s\n", obj.ResponseName, obj.LinearPredictor); + endfunction + + function s = char (obj) + ## char Convert to character representation. + s = sprintf ("%s ~ %s", obj.ResponseName, obj.LinearPredictor); + endfunction + + endmethods + +endclassdef diff --git a/inst/LinearModel.m b/inst/LinearModel.m new file mode 100644 index 00000000..93713c36 --- /dev/null +++ b/inst/LinearModel.m @@ -0,0 +1,1304 @@ +## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com> +## +## This file is part of the statistics package for GNU Octave. +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. 
+## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +classdef LinearModel < CompactLinearModel +## -*- texinfo -*- +## @deftp {statistics} LinearModel +## +## Linear regression model. +## +## A @code{LinearModel} object encapsulates the results of fitting a linear +## regression model. It inherits from @code{CompactLinearModel} and adds +## the source training data, observation-level residuals and diagnostics, +## and model modification methods. +## +## A @code{LinearModel} object is created by @code{fitlm} or +## @code{stepwiselm}. Users do not call the constructor directly. +## +## @strong{Additional Properties (beyond CompactLinearModel)} +## +## @multitable @columnfractions 0.25 0.75 +## @headitem Property @tab Description +## @item @code{Variables} @tab Original input data (all rows, including excluded). +## @item @code{ObservationNames} @tab Cell array of row names. +## @item @code{ObservationInfo} @tab Table with columns Weights, Excluded, Missing, Subset. +## @item @code{Fitted} @tab Fitted values for all observations. +## @item @code{Residuals} @tab Table with columns Raw, Pearson, Studentized, Standardized. +## @item @code{Diagnostics} @tab Table with columns Leverage, CooksDistance, Dffits, S2_i, CovRatio, Dfbetas, HatMatrix. +## @item @code{ModelFitVsNullModel} @tab Struct with fields Fstat, Pvalue, NullModel. +## @item @code{Steps} @tab Stepwise fitting history (empty for fitlm). +## @end multitable +## +## @strong{Methods} +## +## @multitable @columnfractions 0.25 0.75 +## @headitem Method @tab Description +## @item @code{compact} @tab Strip to CompactLinearModel. +## @item @code{addTerms} @tab Add terms and refit. +## @item @code{removeTerms} @tab Remove terms and refit. +## @item @code{step} @tab Single optimal stepwise step. +## @item @code{anova} @tab ANOVA table (partial SS). 
+## @end multitable +## +## @seealso{fitlm, CompactLinearModel} +## @end deftp + + properties (SetAccess = protected) + ## Source data (not on CompactLinearModel) + Variables = []; + ObservationNames = {}; + ObservationInfo = []; + + ## Diagnostics (not on CompactLinearModel) + Fitted = []; + Residuals = []; + Diagnostics = []; + ModelFitVsNullModel = []; + + ## Fitting method + Steps = []; + endproperties + + + methods (Access = protected) + + function obj = LinearModel (s) + ## LinearModel Constructor (internal use only). + ## + ## obj = LinearModel (s) + ## + ## S is a struct with two groups of fields: + ## - All CompactLinearModel fields (passed to parent constructor) + ## - LinearModel-only fields: Variables, ObservationNames, + ## ObservationInfo, Fitted, Residuals, Diagnostics, + ## ModelFitVsNullModel, Steps + + if (nargin < 1) + ## Allow zero-arg construction for Octave classdef internals + return; + endif + + if (!isstruct(s)) + error("LinearModel: constructor argument must be a struct."); + endif + + ## Call parent constructor with the CompactLinearModel fields + obj = obj@CompactLinearModel (s); + + ## Assign LinearModel-only properties + if (isfield (s, "Variables")) + obj.Variables = s.Variables; + endif + if (isfield (s, "ObservationNames")) + obj.ObservationNames = s.ObservationNames; + endif + if (isfield (s, "ObservationInfo")) + obj.ObservationInfo = s.ObservationInfo; + endif + if (isfield (s, "Fitted")) + obj.Fitted = s.Fitted; + endif + if (isfield (s, "Residuals")) + obj.Residuals = s.Residuals; + endif + if (isfield (s, "Diagnostics")) + obj.Diagnostics = s.Diagnostics; + endif + if (isfield (s, "ModelFitVsNullModel")) + obj.ModelFitVsNullModel = s.ModelFitVsNullModel; + endif + if (isfield (s, "Steps")) + obj.Steps = s.Steps; + endif + + endfunction + + endmethods + + methods (Static, Access = public) + + function obj = fromFit (fit_s, X, y, formula, pred_names, resp_name, ... + var_names, var_info, obs_names, obs_info, ... 
+ weights, excluded, missing_mask, steps, ... + variables_data, X_full) + ## fromFit Build a LinearModel from lm_fit_engine output. + ## + ## This static factory method takes the raw lm_fit_engine struct and + ## all metadata, computes observation-level diagnostics, and returns + ## a fully-populated LinearModel object. + ## + ## Parameters: + ## fit_s - struct from lm_fit_engine(X_used, y_used, w_used) + ## X - design matrix for USED (subset) observations only + ## y - response vector for USED observations only + ## formula - Formula struct + ## pred_names - cell of predictor names + ## resp_name - response variable name (char) + ## var_names - cell of all variable names + ## var_info - variable info struct/table + ## obs_names - cell of observation names (or {}) + ## obs_info - struct with Weights, Excluded, Missing, Subset + ## weights - original weights vector (n_total x 1, or []) + ## excluded - logical n_total x 1 excluded mask + ## missing_mask - logical n_total x 1 missing mask + ## steps - Steps struct ([] for fitlm) + ## variables_data - Variables struct/table (source data) + + n_total = numel (excluded); + n_used = fit_s.n; + subset = obs_info.Subset; + + ## Compute Fitted for ALL observations (including excluded/missing) + fitted_all = NaN (n_total, 1); + fitted_all(subset) = fit_s.yhat; + ## Excluded-but-not-missing rows: compute yhat = X_full * beta (Phase 4) + ## MATLAB Block 7: excluded rows DO get Fitted values (X * beta extrapolation) + if (nargin >= 16 && ! isempty (X_full)) + excl_not_missing = excluded & ! 
missing_mask; + if (any (excl_not_missing)) + fitted_all(excl_not_missing) = X_full(excl_not_missing, :) * fit_s.beta; + endif + endif + + ## Compute diagnostics for used observations + h = fit_s.h; + raw = fit_s.resid; + rmse = fit_s.rmse; + mse = fit_s.mse; + sse = fit_s.sse; + dfe = fit_s.dfe; + p_est = fit_s.p_est; + + ## Residual types (for used observations only) + pearson = raw / rmse; + standardized = raw ./ (rmse * sqrt (1 - h)); + S2_i_used = (sse - raw .^ 2 ./ (1 - h)) / (dfe - 1); + studentized = raw ./ (sqrt (S2_i_used) .* sqrt (1 - h)); + + ## Expand residuals to all observations (NaN for excluded/missing) + raw_all = NaN (n_total, 1); + pearson_all = NaN (n_total, 1); + studentized_all = NaN (n_total, 1); + standardized_all = NaN (n_total, 1); + + raw_all(subset) = raw; + pearson_all(subset) = pearson; + studentized_all(subset) = studentized; + standardized_all(subset) = standardized; + + ## Build Residuals table (MATLAB returns n x 4 table) + if (! isempty (obs_names)) + resid_tbl = table (raw_all, pearson_all, studentized_all, ... + standardized_all, ... + "VariableNames", {"Raw", "Pearson", ... + "Studentized", "Standardized"}, ... + "RowNames", obs_names(:)'); + else + resid_tbl = table (raw_all, pearson_all, studentized_all, ... + standardized_all, ... + "VariableNames", {"Raw", "Pearson", ... + "Studentized", "Standardized"}); + endif + + ## Diagnostics for used observations + CooksDistance_used = (1 / p_est) * (h ./ (1 - h)) .* standardized .^ 2; + Dffits_used = studentized .* sqrt (h ./ (1 - h)); + ## CovRatio: Candidate 3 formula, verified to 3.55e-14 against MATLAB + CovRatio_used = (S2_i_used / mse) .^ p_est ./ (1 - h); + + ## Hat matrix from QR factor (n_used x n_used) + HatMatrix_full = NaN (n_total, n_total); + Q_est = fit_s.Q; + H_used = Q_est * Q_est'; + ## Place into full matrix + idx_used = find (subset); + HatMatrix_full(idx_used, idx_used) = H_used; + ## Excluded rows: set to 0 + idx_excl = find (! 
subset); + HatMatrix_full(idx_excl, :) = 0; + HatMatrix_full(:, idx_excl) = 0; + + ## Leverage for all observations + leverage_all = zeros (n_total, 1); + leverage_all(subset) = h; + + ## CooksDistance, Dffits, S2_i, CovRatio for all obs + CooksDistance_all = NaN (n_total, 1); + Dffits_all = NaN (n_total, 1); + S2_i_all = NaN (n_total, 1); + CovRatio_all = NaN (n_total, 1); + + CooksDistance_all(subset) = CooksDistance_used; + Dffits_all(subset) = Dffits_used; + S2_i_all(subset) = S2_i_used; + CovRatio_all(subset) = CovRatio_used; + + ## Dfbetas (n_total x p_tot) via update formula + p_tot = fit_s.p_tot; + Dfbetas_all = NaN (n_total, p_tot); + vcov = fit_s.vcov; + se_coef = sqrt (diag (vcov)); + for i = 1:n_used + ii = idx_used(i); + x_i = X(i, :)'; + e_i = raw(i); + h_i = h(i); + s2_i = S2_i_used(i); + if (s2_i > 0 && (1 - h_i) > 0) + Dfbetas_all(ii, :) = ... + ((vcov * x_i * e_i) / ((1 - h_i) * sqrt (s2_i))) ./ se_coef; + endif + endfor + + ## Build Diagnostics table (MATLAB returns n x 7 table) + if (! isempty (obs_names)) + diag_tbl = table (leverage_all, CooksDistance_all, Dffits_all, ... + S2_i_all, CovRatio_all, Dfbetas_all, ... + HatMatrix_full, ... + "VariableNames", {"Leverage", "CooksDistance", ... + "Dffits", "S2_i", "CovRatio", ... + "Dfbetas", "HatMatrix"}, ... + "RowNames", obs_names(:)'); + else + diag_tbl = table (leverage_all, CooksDistance_all, Dffits_all, ... + S2_i_all, CovRatio_all, Dfbetas_all, ... + HatMatrix_full, ... + "VariableNames", {"Leverage", "CooksDistance", ... + "Dffits", "S2_i", "CovRatio", ... + "Dfbetas", "HatMatrix"}); + endif + + ## ModelFitVsNullModel + if (p_est > 1 && mse > 0) + df_reg = p_est - 1; + F_null = (fit_s.ssr / df_reg) / mse; + p_null = 1 - fcdf (F_null, df_reg, dfe); + else + F_null = NaN; + p_null = NaN; + endif + mfvnm = struct ("Fstat", F_null, "Pvalue", p_null, ... 
+ "NullModel", "constant"); + + ## Build struct for parent + child constructor + s = struct (); + + ## CompactLinearModel fields + s.Coefficients = table (fit_s.beta, fit_s.se, ... + fit_s.tstat, fit_s.pval, ... + "VariableNames", {"Estimate", "SE", "tStat", "pValue"}, ... + "RowNames", formula.CoefficientNames(:)'); + s.CoefficientNames = formula.CoefficientNames; + s.CoefficientCovariance = fit_s.vcov; + s.NumCoefficients = fit_s.p_tot; + s.NumEstimatedCoefficients = fit_s.p_est; + s.DFE = fit_s.dfe; + s.SSE = fit_s.sse; + s.SSR = fit_s.ssr; + s.SST = fit_s.sst; + s.MSE = fit_s.mse; + s.RMSE = fit_s.rmse; + s.Rsquared = fit_s.rsq; + s.LogLikelihood = fit_s.logL; + s.ModelCriterion = fit_s.crit; + s.Robust = []; + s.Formula = LinearFormula (formula); + s.NumObservations = n_used; + s.NumPredictors = numel (pred_names); + s.NumVariables = numel (var_names); + s.PredictorNames = pred_names; + s.ResponseName = resp_name; + ## Wrap VariableInfo as table if it is still a raw struct + if (isstruct (var_info) && ! isempty (var_info)) + s.VariableInfo = table (var_info.Class, var_info.Range, ... + var_info.InModel, var_info.IsCategorical, ... + "VariableNames", {"Class", "Range", ... + "InModel", "IsCategorical"}, ... + "RowNames", var_names(:)'); + else + s.VariableInfo = var_info; + endif + s.VariableNames = var_names; + + ## LinearModel-only fields + if (nargin >= 15 && ! isempty (variables_data)) + s.Variables = variables_data; + else + s.Variables = []; + endif + s.ObservationNames = obs_names; + ## Wrap ObservationInfo as table if it is still a raw struct + if (isstruct (obs_info) && ! isempty (obs_info)) + if (! isempty (obs_names)) + s.ObservationInfo = table (obs_info.Weights, obs_info.Excluded, ... + obs_info.Missing, obs_info.Subset, ... + "VariableNames", {"Weights", "Excluded", ... + "Missing", "Subset"}, ... + "RowNames", obs_names(:)'); + else + s.ObservationInfo = table (obs_info.Weights, obs_info.Excluded, ... + obs_info.Missing, obs_info.Subset, ... 
+ "VariableNames", {"Weights", "Excluded", ... + "Missing", "Subset"}); + endif + else + s.ObservationInfo = obs_info; + endif + s.Fitted = fitted_all; + s.Residuals = resid_tbl; + s.Diagnostics = diag_tbl; + s.ModelFitVsNullModel = mfvnm; + s.Steps = steps; + + obj = LinearModel (s); + + endfunction + + endmethods + + ## Public methods + methods (Access = public) + + function cmdl = compact (obj) + ## compact Strip to CompactLinearModel. + ## + ## cmdl = compact (mdl) + ## + ## Returns a CompactLinearModel with all 23 CompactLinearModel + ## properties copied. LinearModel-only properties (Variables, + ## Residuals, Diagnostics, Fitted, ObservationInfo, ObservationNames, + ## ModelFitVsNullModel, Steps) are NOT included. + + s = struct (); + s.Coefficients = obj.Coefficients; + s.CoefficientNames = obj.CoefficientNames; + s.CoefficientCovariance = obj.CoefficientCovariance; + s.NumCoefficients = obj.NumCoefficients; + s.NumEstimatedCoefficients = obj.NumEstimatedCoefficients; + s.DFE = obj.DFE; + s.SSE = obj.SSE; + s.SSR = obj.SSR; + s.SST = obj.SST; + s.MSE = obj.MSE; + s.RMSE = obj.RMSE; + s.Rsquared = obj.Rsquared; + s.LogLikelihood = obj.LogLikelihood; + s.ModelCriterion = obj.ModelCriterion; + s.Robust = obj.Robust; + s.Formula = obj.Formula; + s.NumObservations = obj.NumObservations; + s.NumPredictors = obj.NumPredictors; + s.NumVariables = obj.NumVariables; + s.PredictorNames = obj.PredictorNames; + s.ResponseName = obj.ResponseName; + s.VariableInfo = obj.VariableInfo; + s.VariableNames = obj.VariableNames; + cmdl = CompactLinearModel (s); + endfunction + + function NewMdl = addTerms (obj, terms) + ## addTerms Add terms to the model and refit. + ## + ## NewMdl = addTerms (mdl, terms) + ## + ## TERMS can be: + ## - A string giving the Wilkinson notation for terms to add + ## (e.g. 'x2' or 'x1:x2') + ## - A terms matrix row(s) to add (binary matrix) + ## + ## Returns a NEW LinearModel; does not modify the original. 
+ + if (nargin < 2) + error ("LinearModel.addTerms: TERMS argument is required."); + endif + + ## Get current formula info + current_terms = obj.Formula.Terms; + current_formula = obj.Formula; + p_names = obj.PredictorNames; + + if (ischar (terms) || isstring (terms)) + ## Parse the Wilkinson term string into term rows + terms = char (terms); + new_rows = __parse_term_string__ (terms, p_names); + elseif (isnumeric (terms)) + new_rows = terms; + else + error ("LinearModel.addTerms: TERMS must be a string or numeric matrix."); + endif + + ## Pad with response column(s) if Terms matrix is wider than new_rows + n_cols_terms = columns (current_terms); + if (columns (new_rows) < n_cols_terms) + new_rows = [new_rows, zeros(rows (new_rows), ... + n_cols_terms - columns (new_rows))]; + endif + + ## Merge: add new rows to existing terms (skip duplicates) + merged_terms = current_terms; + for i = 1:rows (new_rows) + is_dup = false; + for j = 1:rows (merged_terms) + if (isequal (new_rows(i, :), merged_terms(j, :))) + is_dup = true; + break; + endif + endfor + if (! is_dup) + merged_terms = [merged_terms; new_rows(i, :)]; + endif + endfor + + ## Sort terms: intercept first, then by order (sum of row) + term_order = sum (merged_terms, 2); + [~, sort_idx] = sort (term_order); + merged_terms = merged_terms(sort_idx, :); + + ## Rebuild design matrix + y_vec = __get_y__ (obj); + w_vec = __get_weights__ (obj); + + ## Build new design matrix (Phase 3B: returns coef names for categoricals) + [X_new, new_coef_names] = __build_X_from_terms__ (obj, merged_terms); + + ## Refit + fit_s = lm_fit_engine (X_new, y_vec, w_vec); + + ## Update formula (convert to struct for modification; fromFit re-wraps) + if (isa (current_formula, "LinearFormula")) + new_formula = struct ("ResponseName", current_formula.ResponseName, ... + "LinearPredictor", current_formula.LinearPredictor, ... + "Terms", current_formula.Terms, ... + "HasIntercept", current_formula.HasIntercept, ... 
+ "InModel", current_formula.InModel, ... + "CoefficientNames", {current_formula.CoefficientNames}, ... + "PredictorNames", {current_formula.PredictorNames}, ... + "VariableNames", {current_formula.VariableNames}); + else + new_formula = current_formula; + endif + new_formula.Terms = merged_terms; + new_formula.CoefficientNames = new_coef_names; + new_formula.LinearPredictor = __terms_to_formula_str__ ( ... + merged_terms, p_names, ... + new_formula.HasIntercept); + + ## Build new LinearModel + subset = obj.ObservationInfo.Subset; + NewMdl = LinearModel.fromFit (fit_s, X_new, y_vec, new_formula, ... + p_names, obj.ResponseName, obj.VariableNames, ... + obj.VariableInfo, obj.ObservationNames, ... + obj.ObservationInfo, w_vec, ... + obj.ObservationInfo.Excluded, ... + obj.ObservationInfo.Missing, obj.Steps, ... + obj.Variables); + + endfunction + + function NewMdl = removeTerms (obj, terms) + ## removeTerms Remove terms from the model and refit. + ## + ## NewMdl = removeTerms (mdl, terms) + ## + ## TERMS can be: + ## - A string giving the Wilkinson notation for terms to remove + ## - A terms matrix row(s) to remove (binary matrix) + ## + ## Returns a NEW LinearModel; does not modify the original. + + if (nargin < 2) + error ("LinearModel.removeTerms: TERMS argument is required."); + endif + + ## Get current formula info + current_terms = obj.Formula.Terms; + current_formula = obj.Formula; + p_names = obj.PredictorNames; + + if (ischar (terms) || isstring (terms)) + terms = char (terms); + remove_rows = __parse_term_string__ (terms, p_names); + elseif (isnumeric (terms)) + remove_rows = terms; + else + error ("LinearModel.removeTerms: TERMS must be a string or numeric."); + endif + + ## Pad with response column(s) if Terms matrix is wider than remove_rows + n_cols_terms = columns (current_terms); + if (columns (remove_rows) < n_cols_terms) + remove_rows = [remove_rows, zeros(rows (remove_rows), ... 
+ n_cols_terms - columns (remove_rows))]; + endif + + ## Remove matching rows from current terms + keep_mask = true (rows (current_terms), 1); + for i = 1:rows (remove_rows) + for j = 1:rows (current_terms) + if (isequal (remove_rows(i, :), current_terms(j, :))) + keep_mask(j) = false; + break; + endif + endfor + endfor + + if (sum (keep_mask) == rows (current_terms)) + warning ("LinearModel:removeTerms", ... + "LinearModel.removeTerms: no matching terms found."); + endif + + merged_terms = current_terms(keep_mask, :); + + ## Rebuild design matrix + y_vec = __get_y__ (obj); + w_vec = __get_weights__ (obj); + [X_new, new_coef_names] = __build_X_from_terms__ (obj, merged_terms); + + ## Refit + fit_s = lm_fit_engine (X_new, y_vec, w_vec); + + ## Update formula (convert to struct for modification; fromFit re-wraps) + if (isa (current_formula, "LinearFormula")) + new_formula = struct ("ResponseName", current_formula.ResponseName, ... + "LinearPredictor", current_formula.LinearPredictor, ... + "Terms", current_formula.Terms, ... + "HasIntercept", current_formula.HasIntercept, ... + "InModel", current_formula.InModel, ... + "CoefficientNames", {current_formula.CoefficientNames}, ... + "PredictorNames", {current_formula.PredictorNames}, ... + "VariableNames", {current_formula.VariableNames}); + else + new_formula = current_formula; + endif + new_formula.Terms = merged_terms; + new_formula.CoefficientNames = new_coef_names; + new_formula.LinearPredictor = __terms_to_formula_str__ ( ... + merged_terms, p_names, ... + new_formula.HasIntercept); + + ## Build new LinearModel + NewMdl = LinearModel.fromFit (fit_s, X_new, y_vec, new_formula, ... + p_names, obj.ResponseName, obj.VariableNames, ... + obj.VariableInfo, obj.ObservationNames, ... + obj.ObservationInfo, w_vec, ... + obj.ObservationInfo.Excluded, ... + obj.ObservationInfo.Missing, obj.Steps, ... + obj.Variables); + + endfunction + + function NewMdl = step (obj, varargin) + ## step Take one or more stepwise steps. 
+ ## + ## NewMdl = step (mdl) + ## NewMdl = step (mdl, Name, Value) + ## + ## Name-Value pairs: + ## 'Criterion' - 'sse' (default), 'aic', 'bic' + ## 'PEnter' - p-value threshold for adding (default 0.05) + ## 'PRemove' - p-value threshold for removing (default 0.10) + ## 'NSteps' - maximum number of steps (default 1) + ## 'Verbose' - 0 (silent), 1 (default, print steps) + ## 'Upper' - maximum model (default 'interactions') + ## 'Lower' - minimum model (default 'constant') + + ## Parse name-value pairs + criterion = "sse"; + p_enter = 0.05; + p_remove = 0.10; + n_steps = 1; + verbose = 1; + upper_spec = "interactions"; + lower_spec = "constant"; + + i = 1; + while (i <= numel (varargin)) + if (! ischar (varargin{i})) + error ("LinearModel.step: expected parameter name string."); + endif + switch (lower (varargin{i})) + case "criterion" + criterion = lower (varargin{i+1}); + i = i + 2; + case "penter" + p_enter = varargin{i+1}; + i = i + 2; + case "premove" + p_remove = varargin{i+1}; + i = i + 2; + case "nsteps" + n_steps = varargin{i+1}; + i = i + 2; + case "verbose" + verbose = varargin{i+1}; + i = i + 2; + case "upper" + upper_spec = varargin{i+1}; + i = i + 2; + case "lower" + lower_spec = varargin{i+1}; + i = i + 2; + otherwise + error ("LinearModel.step: unknown parameter '%s'.", varargin{i}); + endswitch + endwhile + + p_names = obj.PredictorNames; + n_pred = numel (p_names); + + ## Build upper/lower terms matrices + upper_terms = [__step_spec_to_terms__(upper_spec, n_pred), ... + zeros(rows (__step_spec_to_terms__(upper_spec, n_pred)), 1)]; + lower_terms = [__step_spec_to_terms__(lower_spec, n_pred), ... + zeros(rows (__step_spec_to_terms__(lower_spec, n_pred)), 1)]; + + ## Build cat_flags from VariableInfo + cat_flags = false (1, n_pred + 1); + if ((isstruct (obj.VariableInfo) && isfield (obj.VariableInfo, "IsCategorical")) || ... 
+ isa (obj.VariableInfo, "table")) + for k = 1:n_pred + vi_idx = strcmp (obj.VariableNames, p_names{k}); + if (any (vi_idx)) + cat_flags(k) = obj.VariableInfo.IsCategorical(vi_idx); + endif + endfor + endif + + current_terms = obj.Formula.Terms; + use_ftest = strcmp (criterion, "sse"); + + ## Initialize or extend Steps.History + if (isempty (obj.Steps)) + start_lp = obj.Formula.LinearPredictor; + log_action = {'Start'}; + log_term = {start_lp}; + log_terms = {current_terms}; + log_df = obj.NumCoefficients; + log_del_df = NaN; + log_fstat = NaN; + log_pval = NaN; + log_critval = __get_criterion__ (obj, criterion); + n_log = 1; + else + prev_h = obj.Steps.History; + n_log = numel (prev_h.Action); + log_action = prev_h.Action; + log_term = cell (n_log, 1); + for k = 1:n_log + if (iscell (prev_h.TermName{k})) + log_term{k} = prev_h.TermName{k}{1}; + else + log_term{k} = prev_h.TermName{k}; + endif + endfor + log_terms = prev_h.Terms; + log_df = prev_h.DF; + log_del_df = prev_h.delDF; + if (use_ftest && isfield (prev_h, "FStat")) + log_fstat = prev_h.FStat; + log_pval = prev_h.pValue; + else + log_fstat = NaN (n_log, 1); + log_pval = NaN (n_log, 1); + endif + log_critval = NaN (n_log, 1); + endif + + ## Check for empty candidate set + all_cands = __stepwiselm_candidates__ (current_terms, lower_terms, ... + upper_terms, cat_flags, p_names); + if (isempty (all_cands) && verbose >= 1) + fprintf ("No terms to add to or remove from initial model.\n"); + endif + + ## Search loop + n_taken = 0; + current_mdl = obj; + + while (n_taken < n_steps) + made_change = false; + + ## Phase A: BACKWARD SCAN (remove until nothing qualifies) + while (n_taken < n_steps) + candidates = __stepwiselm_candidates__ ( ... 
+ current_terms, lower_terms, upper_terms, cat_flags, p_names); + rem_idx = []; + for ci = 1:numel (candidates) + if (strcmp (candidates(ci).action, "remove")) + rem_idx(end+1) = ci; + endif + endfor + if (isempty (rem_idx)); break; endif + + best_ri = 0; best_pval_r = -Inf; best_fstat_r = 0; best_df_r = 0; + for ri = 1:numel (rem_idx) + c = candidates(rem_idx(ri)); + f_stat = 0; f_pval = 1; + try + test_mdl = removeTerms (current_mdl, c.term_row(1:n_pred)); + ss_d = test_mdl.SSE - current_mdl.SSE; + df_d = test_mdl.DFE - current_mdl.DFE; + if (df_d > 0 && current_mdl.MSE > 0) + f_stat = (ss_d / df_d) / current_mdl.MSE; + f_pval = 1 - fcdf (f_stat, df_d, current_mdl.DFE); + endif + catch; continue; end_try_catch + if (f_pval > p_remove && f_pval > best_pval_r) + best_ri = rem_idx(ri); best_pval_r = f_pval; + best_fstat_r = f_stat; best_df_r = df_d; + endif + endfor + if (best_ri == 0); break; endif + + best_c = candidates(best_ri); + current_mdl = removeTerms (current_mdl, best_c.term_row(1:n_pred)); + mask = true (rows (current_terms), 1); + for j = 1:rows (current_terms) + if (isequal (current_terms(j,:), best_c.term_row)) + mask(j) = false; break; + endif + endfor + current_terms = current_terms(mask, :); + n_taken++; made_change = true; + n_log++; + log_action{n_log} = "Remove"; log_term{n_log} = best_c.term_name; + log_terms{n_log} = current_terms; + log_df(n_log) = current_mdl.NumCoefficients; + log_del_df(n_log) = -best_df_r; + log_fstat(n_log) = best_fstat_r; log_pval(n_log) = best_pval_r; + log_critval(n_log) = __get_criterion__ (current_mdl, criterion); + if (verbose >= 1) + fprintf ("%d. Removing %s, FStat = %g, pValue = %g\n", ... + n_taken, best_c.term_name, best_fstat_r, best_pval_r); + endif + endwhile + if (n_taken >= n_steps); break; endif + + ## Phase B: FORWARD SCAN (add one term) + candidates = __stepwiselm_candidates__ ( ... 
+ current_terms, lower_terms, upper_terms, cat_flags, p_names); + add_idx = []; + for ci = 1:numel (candidates) + if (strcmp (candidates(ci).action, "add")) + add_idx(end+1) = ci; + endif + endfor + if (isempty (add_idx)); break; endif + + best_ai = 0; best_pval_a = Inf; best_fstat_a = 0; + best_df_a = 0; best_add_mdl = []; + for ai = 1:numel (add_idx) + c = candidates(add_idx(ai)); + f_stat = 0; f_pval = 1; + try + test_mdl = addTerms (current_mdl, c.term_row(1:n_pred)); + ss_d = current_mdl.SSE - test_mdl.SSE; + df_d = current_mdl.DFE - test_mdl.DFE; + if (df_d > 0 && test_mdl.MSE > 0) + f_stat = (ss_d / df_d) / test_mdl.MSE; + f_pval = 1 - fcdf (f_stat, df_d, test_mdl.DFE); + endif + catch; continue; end_try_catch + if (f_pval < p_enter && f_pval < best_pval_a) + best_ai = add_idx(ai); best_pval_a = f_pval; + best_fstat_a = f_stat; best_df_a = df_d; + best_add_mdl = test_mdl; + endif + endfor + if (best_ai == 0) + if (! made_change); break; endif + break; + endif + + best_c = candidates(best_ai); + current_mdl = best_add_mdl; + current_terms = [current_terms; best_c.term_row]; + term_order = sum (current_terms(:, 1:n_pred), 2); + [~, si] = sort (term_order); + current_terms = current_terms(si, :); + n_taken++; made_change = true; + n_log++; + log_action{n_log} = "Add"; log_term{n_log} = best_c.term_name; + log_terms{n_log} = current_terms; + log_df(n_log) = current_mdl.NumCoefficients; + log_del_df(n_log) = best_df_a; + log_fstat(n_log) = best_fstat_a; log_pval(n_log) = best_pval_a; + log_critval(n_log) = __get_criterion__ (current_mdl, criterion); + if (verbose >= 1) + fprintf ("%d. Adding %s, FStat = %g, pValue = %g\n", ... 
+ n_taken, best_c.term_name, best_fstat_a, best_pval_a); + endif + endwhile + + ## Build Steps struct + resp_n = obj.ResponseName; + if (isempty (obj.Steps)) + start_formula = [resp_n, " ~ ", log_term{1}]; + else + start_formula = obj.Steps.Start; + endif + Steps.Start = start_formula; Steps.Lower = lower_spec; + Steps.Upper = upper_spec; Steps.Criterion = upper (criterion); + Steps.PEnter = p_enter; Steps.PRemove = p_remove; + History.Action = log_action(:); + History.TermName = cell (n_log, 1); + for k = 1:n_log + History.TermName{k} = {log_term{k}}; + endfor + History.Terms = log_terms(:); + History.DF = log_df(:); History.delDF = log_del_df(:); + if (use_ftest) + History.FStat = log_fstat(:); History.pValue = log_pval(:); + else + History.(upper (criterion)) = log_critval(:); + endif + Steps.History = History; + current_mdl.Steps = Steps; + NewMdl = current_mdl; + + endfunction + + function tbl = anova (obj, varargin) + ## anova ANOVA table for LinearModel (Partial SS / Type III). + ## + ## tbl = anova (mdl) + ## tbl = anova (mdl, 'component') + ## + ## Returns a struct with fields: + ## SumSq - sum of squares for each term + Error + ## DF - degrees of freedom + ## MeanSq - mean squares + ## F - F-statistic (NaN for Error row) + ## pValue - p-value (NaN for Error row) + ## RowNames - cell of term names + 'Error' + + ## Parse optional anova type argument (only 'component' for now) + + current_terms = obj.Formula.Terms; + p_names = obj.PredictorNames; + + ## Identify non-intercept terms + is_intercept = all (current_terms == 0, 2); + term_idx = find (~is_intercept); + n_terms = numel (term_idx); + + ## Compute partial SS for each term (Type III): + ## SS(term_k) = SSE(model without term_k) - SSE(full model) + SumSq = zeros (n_terms + 1, 1); + DF = zeros (n_terms + 1, 1); + MeanSq = zeros (n_terms + 1, 1); + F_val = NaN (n_terms + 1, 1); + p_val = NaN (n_terms + 1, 1); + term_names = cell (n_terms + 1, 1); + + y_vec = __get_y__ (obj); + w_vec = __get_weights__ 
(obj); + + for k = 1:n_terms + ## Remove term k and refit + reduced_terms = current_terms; + reduced_terms(term_idx(k), :) = []; + + [X_reduced, ~] = __build_X_from_terms__ (obj, reduced_terms); + fit_reduced = lm_fit_engine (X_reduced, y_vec, w_vec); + + ## Partial SS + ss_k = fit_reduced.sse - obj.SSE; + df_k = fit_reduced.dfe - obj.DFE; + + SumSq(k) = ss_k; + DF(k) = df_k; + MeanSq(k) = ss_k / df_k; + F_val(k) = MeanSq(k) / obj.MSE; + p_val(k) = 1 - fcdf (F_val(k), df_k, obj.DFE); + + ## Term name + idx = find (current_terms(term_idx(k), :)); + parts = {}; + for j = 1:numel (idx) + parts{j} = p_names{idx(j)}; + endfor + ## Handle power terms + row = current_terms(term_idx(k), :); + power_parts = {}; + for j = 1:numel (idx) + if (row(idx(j)) == 1) + power_parts{end+1} = p_names{idx(j)}; + else + power_parts{end+1} = sprintf ("%s^%d", p_names{idx(j)}, ... + row(idx(j))); + endif + endfor + term_names{k} = strjoin (power_parts, ":"); + endfor + + ## Error row + SumSq(end) = obj.SSE; + DF(end) = obj.DFE; + MeanSq(end) = obj.MSE; + F_val(end) = NaN; + p_val(end) = NaN; + term_names{end} = "Error"; + + tbl = struct ("SumSq", SumSq, "DF", DF, "MeanSq", MeanSq, ... + "F", F_val, "pValue", p_val, "RowNames", {term_names}); + + endfunction + + endmethods + +endclassdef + +## ============================================================ +## Private helper functions +## ============================================================ + +function rows_out = __parse_term_string__ (term_str, pred_names) + ## Parse a Wilkinson-style term string into binary term matrix rows. + ## e.g. 
'x2' -> [0 1 0], 'x1:x2' -> [1 1 0] + + n_pred = numel (pred_names); + + ## Split by '+' to get individual terms + terms = strtrim (strsplit (term_str, "+")); + rows_out = zeros (numel (terms), n_pred); + + for i = 1:numel (terms) + ## Split by ':' for interaction terms + parts = strtrim (strsplit (terms{i}, ":")); + for j = 1:numel (parts) + ## Handle power notation (e.g., 'x1^2') + p = parts{j}; + caret = strfind (p, "^"); + if (! isempty (caret)) + base = strtrim (p(1:caret-1)); + pwr = str2double (p(caret+1:end)); + else + base = p; + pwr = 1; + endif + ## Find the predictor index + idx = find (strcmp (pred_names, base)); + if (isempty (idx)) + error ("LinearModel: unrecognized predictor name '%s'.", base); + endif + rows_out(i, idx(1)) = pwr; + endfor + endfor +endfunction + +function [X, coef_names] = __rebuild_X__ (obj) + ## Rebuild the design matrix from stored Formula + predictor data. + [X, coef_names] = __build_X_from_terms__ (obj, obj.Formula.Terms); +endfunction + +function [X, coef_names] = __build_X_from_terms__ (obj, terms) + ## Build design matrix from terms matrix and stored predictor data. + ## + ## Phase 3B: handles categorical variables via reference (corner-point) + ## dummy coding. For categorical predictors with K levels, K-1 dummy + ## columns are generated (dropping the first level when intercept is + ## present). Interactions between categorical and numeric variables + ## produce a Cartesian product of dummy columns x numeric columns. 
+ ## + ## Returns: + ## X - n_obs x p design matrix + ## coef_names - 1 x p cell of coefficient name strings + + ## Get predictor data for subset observations + subset = obj.ObservationInfo.Subset; + p_names = obj.PredictorNames; + n_pred = numel (p_names); + + ## Detect intercept + has_intercept = any (all (terms == 0, 2)); + + ## Extract predictor columns and detect categorical status + n_obs = sum (subset); + pred_raw = cell (1, n_pred); + is_categorical = false (1, n_pred); + cat_levels = cell (1, n_pred); + cat_indices = cell (1, n_pred); + + for k = 1:n_pred + pname = p_names{k}; + if (! isempty (obj.Variables) && isa (obj.Variables, "table")) + col_full = obj.Variables.(pname); + elseif (! isempty (obj.Variables) && isstruct (obj.Variables)) + if (isfield (obj.Variables, pname)) + col_full = obj.Variables.(pname); + else + error ("LinearModel: predictor '%s' not found in Variables.", pname); + endif + else + error (strcat ("LinearModel: cannot rebuild design matrix", ... + " without stored Variables data.")); + endif + + ## Extract subset rows + if (iscell (col_full) || iscellstr (col_full) || isstring (col_full)) + col = col_full(subset); + elseif (isa (col_full, "categorical")) + col = col_full(subset); + else + col = col_full(subset, :); + endif + + pred_raw{k} = col; + + ## Detect categorical: use VariableInfo.IsCategorical as primary source + ## (Fix 4, Phase 3B), with type-sniffing as a secondary fallback. + vi_is_cat = false; + if (! isempty (obj.VariableInfo) && ... + ((isstruct (obj.VariableInfo) && isfield (obj.VariableInfo, "IsCategorical")) || ... 
+ isa (obj.VariableInfo, "table"))) + vi_idx = strcmp (obj.VariableNames, pname); + if (any (vi_idx)) + vi_is_cat = obj.VariableInfo.IsCategorical(vi_idx); + endif + endif + + ## Type-sniff fallback (cellstr, string, categorical) + type_is_cat = iscellstr (col) || isstring (col) || isa (col, "categorical"); + + if (vi_is_cat || type_is_cat) + is_categorical(k) = true; + if (isa (col, "categorical")) + levs = categories (col); + cat_levels{k} = cellstr (levs); + [~, idx] = ismember (col, levs); + cat_indices{k} = idx; + elseif (iscellstr (col)) + [u, ~, idx] = unique (col); + cat_levels{k} = u; + cat_indices{k} = idx; + elseif (isstring (col)) + col_c = cellstr (col); + [u, ~, idx] = unique (col_c); + cat_levels{k} = u; + cat_indices{k} = idx; + else + ## Numeric column flagged categorical via VariableInfo.IsCategorical + ## Treat unique numeric values as levels (sorted) + [u, ~, idx] = unique (col); + cat_levels{k} = cellfun (@num2str, num2cell (u), "UniformOutput", false); + cat_indices{k} = idx; + endif + else + is_categorical(k) = false; + endif + endfor + + ## Build X column by column from terms + X = []; + coef_names = {}; + + for i = 1:rows (terms) + row = terms(i, :); + + if (all (row == 0)) + ## Intercept + X = [X, ones(n_obs, 1)]; + coef_names{end+1} = "(Intercept)"; + continue; + endif + + ## Find active predictors for this term + active_idx = find (row); + + ## Build the term's columns via Cartesian product approach. 
+ ## Start with a single column of ones, then for each active predictor: + ## - If numeric: multiply current block by (data.^power) + ## - If categorical: Cartesian product with dummy columns + current_block = ones (n_obs, 1); + current_names = {''}; + + for ai = 1:numel (active_idx) + v = active_idx(ai); + pname = p_names{v}; + pwr = row(v); + + if (is_categorical(v)) + ## Build dummy columns (reference coding) + levs = cat_levels{v}; + idx = cat_indices{v}; + n_lev = numel (levs); + + if (has_intercept) + start_lev = 2; + else + start_lev = 1; + endif + + n_dum = n_lev - start_lev + 1; + dummies = zeros (n_obs, n_dum); + dum_names = cell (1, n_dum); + + for L = start_lev:n_lev + col_idx = L - start_lev + 1; + dummies(:, col_idx) = double (idx == L); + dum_names{col_idx} = sprintf ("%s_%s", pname, levs{L}); + endfor + + ## Cartesian product of current block and dummies + next_block = []; + next_names = {}; + for c1 = 1:columns (current_block) + for c2 = 1:columns (dummies) + next_block = [next_block, current_block(:, c1) .* dummies(:, c2)]; + n1 = current_names{c1}; + n2 = dum_names{c2}; + if (isempty (n1)) + next_names{end+1} = n2; + else + next_names{end+1} = [n1, ":", n2]; + endif + endfor + endfor + current_block = next_block; + current_names = next_names; + + else + ## Numeric predictor: multiply current block by data.^power + col_data = pred_raw{v}; + if (pwr > 1) + col_data = col_data .^ pwr; + disp_name = sprintf ("%s^%d", pname, pwr); + else + disp_name = pname; + endif + + current_block = bsxfun (@times, current_block, col_data); + for cn = 1:numel (current_names) + if (isempty (current_names{cn})) + current_names{cn} = disp_name; + else + current_names{cn} = [current_names{cn}, ":", disp_name]; + endif + endfor + endif + endfor + + X = [X, current_block]; + coef_names = [coef_names, current_names]; + endfor +endfunction + +function y = __get_y__ (obj) + ## Get response vector for subset observations. 
+ subset = obj.ObservationInfo.Subset; + resp = obj.ResponseName; + + if (! isempty (obj.Variables) && isa (obj.Variables, "table")) + y_full = obj.Variables.(resp); + y = y_full(subset); + elseif (! isempty (obj.Variables) && isstruct (obj.Variables)) + if (isfield (obj.Variables, resp)) + y_full = obj.Variables.(resp); + y = y_full(subset); + else + error ("LinearModel: response '%s' not found in Variables.", resp); + endif + else + ## Reconstruct from Fitted + Residuals + y = obj.Fitted(subset) + obj.Residuals.Raw(subset); + endif +endfunction + +function w = __get_weights__ (obj) + ## Get weights vector for subset observations, or [] if unweighted. + subset = obj.ObservationInfo.Subset; + w_all = obj.ObservationInfo.Weights; + if (all (w_all == 1)) + w = []; + else + w = w_all(subset); + endif +endfunction + +function score = __get_criterion__ (mdl, criterion) + ## Get model selection criterion value. + switch (criterion) + case "sse" + score = mdl.SSE; + case "aic" + score = mdl.ModelCriterion.AIC; + case "bic" + score = mdl.ModelCriterion.BIC; + case "rsquared" + score = -mdl.Rsquared.Ordinary; ## negate so lower = better + case "adjrsquared" + score = -mdl.Rsquared.Adjusted; + otherwise + score = mdl.SSE; + endswitch +endfunction + +function names = __terms_to_coef_names__ (terms, pred_names, has_intercept) + ## Convert terms matrix to coefficient names. + names = {}; + for i = 1:rows (terms) + row = terms(i, :); + if (all (row == 0)) + names{end+1} = "(Intercept)"; + else + parts = {}; + idx = find (row); + for j = 1:numel (idx) + if (row(idx(j)) == 1) + parts{end+1} = pred_names{idx(j)}; + else + parts{end+1} = sprintf ("%s^%d", pred_names{idx(j)}, row(idx(j))); + endif + endfor + names{end+1} = strjoin (parts, ":"); + endif + endfor +endfunction + +function fstr = __terms_to_formula_str__ (terms, pred_names, has_intercept) + ## Convert terms matrix to Wilkinson formula string (RHS only). 
+ parts = {}; + for i = 1:rows (terms) + row = terms(i, :); + if (all (row == 0)) + parts{end+1} = "1"; + else + tparts = {}; + idx = find (row); + for j = 1:numel (idx) + if (row(idx(j)) == 1) + tparts{end+1} = pred_names{idx(j)}; + else + tparts{end+1} = sprintf ("%s^%d", pred_names{idx(j)}, row(idx(j))); + endif + endfor + parts{end+1} = strjoin (tparts, ":"); + endif + endfor + fstr = strjoin (parts, " + "); +endfunction + +## ============================================================ +## Tests +## ============================================================ + +## Test: LinearModel class exists and is parseable +%!test +%! assert (exist ("LinearModel", "class") > 0 || ... +%! exist ("LinearModel") > 0); + diff --git a/inst/dummyvar.m b/inst/dummyvar.m index 4a86e103..fc720430 100644 --- a/inst/dummyvar.m +++ b/inst/dummyvar.m @@ -1,4 +1,4 @@ -## Copyright (C) 2025 Jayant Chauhan <0001jayant@gmail.com> +## Copyright (C) 2025-26 Jayant Chauhan <0001jayant@gmail.com> ## Copyright (C) 2026 Andreas Bertsatos ## ## This file is part of the statistics package for GNU Octave. @@ -59,31 +59,28 @@ print_usage; endif + if (!isnumeric (g) && !isa (g, 'categorical')) + error ("dummyvar: unsupported type of grouping variable."); + endif + + [nr, nc] = size (g); ## --- CATEGORICAL branch --- if (isa (g, 'categorical')) - if (! isvector (g) || nc != 1) - error (strcat ("dummyvar: categorical grouping", ... 
- " variable must be a column vector.")); + if (nc != 1 || ndims(g) > 2) + error ("Categorical grouping variable must have one column."); endif - g_cats = cellstr (categories (g)); - g_cstr = cellstr (g(:)); - K = numel (g_cats); + [idx, gn] = grp2idx (g); + K = numel (gn); D = zeros (nr, K); for i = 1:nr - if (isundefined (g(i))) + if (isnan (idx(i))) D(i,:) = NaN; else - for k = 1:K - if (strcmp (g_cstr{i}, g_cats{k})) - D(i,k) = 1; - break; - endif - endfor + D(i, idx(i)) = 1; endif endfor @@ -91,12 +88,10 @@ elseif (isnumeric (g)) if (ndims (g) != 2) - error (strcat ("dummyvar: numeric grouping variable", ... - " must be either a vector or a matrix.")); + error ("dummyvar: numeric grouping variable must be either a vector or a matrix."); endif - if (any (g(:) <= 0) || any (g(:) != fix (g(:)))) - error (strcat ("dummyvar: numeric grouping variable", ... - " must explicitly contain positive integers.")); + if (!isempty(g) && (any (g(:) <= 0) || any (g(:) != fix (g(:))))) + error ("dummyvar: numeric grouping variable must explicitly contain positive integers."); endif ## Force vector to column vector @@ -107,6 +102,11 @@ endif K = max (g, [], 1); + ## Handle empty input properly + if (isempty(K)) + D = []; + return; + endif D = zeros (nr, sum (K)); ij = 0; @@ -118,45 +118,11 @@ endfor endfor - ## --- CELLSTRING branch --- - elseif (iscellstr (g) && isvector (g)) - - if (! isvector (g) || nc != 1) - error (strcat ("dummyvar: cellstring grouping", ... - " variable must be a column vector.")); - endif - - g = grp2idx (g); - K = max (g); - D = zeros (length (g), K); - for i = 1:K - D(g == i, i) = 1; - endfor - - ## --- CELL ARRAY branch --- - elseif (iscell (g) && isvector (g)) - - if (any (diff (cellfun (@(x) size (x, 1), g)))) - error (strcat ("dummyvar: all grouping variables in cell array", ... 
- " must have the same number of observations.")); - endif - - D = []; - for i = 1:numel (g) - g_var = g{i}; - D_var = dummyvar (g_var); - D = [D, D_var]; - endfor - - else - error ("dummyvar: unsupported type of grouping variable."); endif endfunction ## Test output -%!assert (dummyvar ([]), []) -%!assert (dummyvar (ones (2, 0)), ones (2, 0)) %!test %! ## numeric grouping vector %! g = [1; 2; 1; 3; 2]; @@ -188,42 +154,32 @@ %! 0, 1, 0, 1, 0; 0, 1, 0, 0, 1; 0, 1, 1, 0, 0; 0, 1, 0, 1, 0]; %! assert (D, D1); %!test -%! phone = {'mob'; 'land'; 'mob';'mob';'mob';'land';'land'}; -%! codes = categorical ([202; 202; 103; 103; 202; 103; 202]); -%! D = dummyvar ({phone, codes}); -%! D1 = [1, 0, 0, 1; 0, 1, 0, 1; 1, 0, 1, 0; 1, 0, 1, 0; ... -%! 1, 0, 0, 1; 0, 1, 1, 0; 0, 1, 0, 1]; -%! assert (D, D1); -%!test %! colors = {'red'; 'blue'; 'red'; 'green'; 'yellow'; 'blue'}; %! D = dummyvar (categorical (colors)); %! D1 = [0, 0, 1, 0; 1, 0, 0, 0; 0, 0, 1, 0; 0, 1, 0, 0; 0, 0, 0, 1; 1, 0, 0, 0]; %! assert (D, D1); %!test -%! colors = {'red'; 'blue'; 'red'; 'green'; 'yellow'; 'blue'}; -%! D = dummyvar (colors); -%! D1 = [1, 0, 0, 0; 0, 1, 0, 0; 1, 0, 0, 0; 0, 0, 1, 0; 0, 0, 0, 1; 0, 1, 0, 0]; -%! assert (D, D1); -%! D = dummyvar ({colors}); -%! D1 = [1, 0, 0, 0; 0, 1, 0, 0; 1, 0, 0, 0; 0, 0, 1, 0; 0, 0, 0, 1; 0, 1, 0, 0]; -%! assert (D, D1); -%!test %! g = [1, 2, 1, 2, 1, 3, 2, 1]; %! D = dummyvar (g); %! D1 = [1, 0, 0; 0, 1, 0; 1, 0, 0; 0, 1, 0; 1, 0, 0; 0, 0, 1; 0, 1, 0; 1, 0, 0]; %! assert (D, D1); - -## Test input validation +%!test +%! g = categorical ({'a'; 'b'; 'a'}, {'b', 'a', 'c'}, {'b', 'a', 'c'}); +%! D = dummyvar (g); +%! assert (D, [0, 1, 0; 1, 0, 0; 0, 1, 0]); +%!test +%! g = categorical ({''; ''; ''}, {'x', 'y'}); +%! D = dummyvar (g); +%! assert (D, [NaN, NaN; NaN, NaN; NaN, NaN]); +%!test +%! g = categorical ({'a'; 'a'; 'b'}, {'a', 'b', 'c'}); +%! D = dummyvar (g); +%! assert (D, [1, 0, 0; 1, 0, 0; 0, 1, 0]); +%! 
assert (size (D, 2), 3); %!error dummyvar () -%!error dummyvar (1, 2) -%!error ... -%! dummyvar (categorical ({'a', 'b'})) -%!error ... -%! dummyvar (ones (3, 3, 3)) -%!error ... -%! dummyvar ([2, 4, 0, 8, 1]) -%!error ... -%! dummyvar ({'a', 'b'}) -%!error ... -%! dummyvar ({[2;3;4;5], [1;2;3]}) -%!error dummyvar ([true; false]) +%!error dummyvar (1, 2) +%!error dummyvar (categorical ({'a', 'b'})) +%!error dummyvar ({'a', 'b'}) +%!error dummyvar (ones (3, 3, 3)) +%!error dummyvar ([2, 4, 0, 8, 1]) +%!error dummyvar ([true; false]) diff --git a/inst/fitlm.m b/inst/fitlm.m index bb397e4a..ff51f2f2 100644 --- a/inst/fitlm.m +++ b/inst/fitlm.m @@ -1,4 +1,4 @@ -## Copyright (C) 2022 Andrew Penn +## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com> ## ## This file is part of the statistics package for GNU Octave. ## @@ -16,397 +16,322 @@ ## this program; if not, see . ## -*- texinfo -*- -## @deftypefn {statistics} {@var{tab} =} fitlm (@var{X}, @var{y}) -## @deftypefnx {statistics} {@var{tab} =} fitlm (@var{X}, @var{y}, @var{name}, @var{value}) -## @deftypefnx {statistics} {@var{tab} =} fitlm (@var{X}, @var{y}, @var{modelspec}) -## @deftypefnx {statistics} {@var{tab} =} fitlm (@var{X}, @var{y}, @var{modelspec}, @var{name}, @var{value}) -## @deftypefnx {statistics} {[@var{tab}] =} fitlm (@dots{}) -## @deftypefnx {statistics} {[@var{tab}, @var{stats}] =} fitlm (@dots{}) -## @deftypefnx {statistics} {[@var{tab}, @var{stats}] =} fitlm (@dots{}) +## @deftypefn {statistics} {@var{mdl} =} fitlm (@var{X}, @var{y}) +## @deftypefnx {statistics} {@var{mdl} =} fitlm (@var{X}, @var{y}, @var{modelspec}) +## @deftypefnx {statistics} {@var{mdl} =} fitlm (@var{X}, @var{y}, @var{modelspec}, @var{Name}, @var{Value}) +## @deftypefnx {statistics} {@var{mdl} =} fitlm (@var{tbl}) +## @deftypefnx {statistics} {@var{mdl} =} fitlm (@var{tbl}, @var{modelspec}) +## @deftypefnx {statistics} {@var{mdl} =} fitlm (@var{tbl}, @var{modelspec}, @var{Name}, @var{Value}) ## -## Regress the continuous 
outcome (i.e. dependent variable) @var{y} on -## continuous or categorical predictors (i.e. independent variables) @var{X} -## by minimizing the sum-of-squared residuals. Unless requested otherwise, -## @qcode{fitlm} prints the model formula, the regression coefficients (i.e. -## parameters/contrasts) and an ANOVA table. Note that unlike @qcode{anovan}, -## @qcode{fitlm} treats all factors as continuous by default. A bootstrap -## resampling variant of this function, @code{bootlm}, is available in the -## statistics-resampling package and has similar usage. +## Fit a linear regression model. ## -## @var{X} must be a column major matrix or cell array consisting of the -## predictors. A constant term (intercept) should not be included in X - it -## is automatically added to the model. @var{y} must be a column vector -## corresponding to the outcome variable. @var{modelspec} can specified as -## one of the following: +## @code{fitlm} fits a linear regression model and returns a +## @code{LinearModel} object. ## -## @itemize -## @item -## "constant" : model contains only a constant (intercept) term. -## -## @item -## "linear" (default) : model contains an intercept and linear term for each -## predictor. -## -## @item -## "interactions" : model contains an intercept, linear term for each predictor -## and all products of pairs of distinct predictors. -## -## @item -## "full" : model contains an intercept, linear term for each predictor and -## all combinations of the predictors. +## @strong{Inputs} ## -## @item -## a matrix of term definitions : an t-by-(N+1) matrix specifying terms in -## a model, where t is the number of terms, N is the number of predictor -## variables, and +1 accounts for the outcome variable. The outcome variable -## is the last column in the terms matrix and must be a column of zeros. -## An intercept must be specified in the first row of the terms matrix and -## must be a row of zeros. 
-## @end itemize -## -## @qcode{fitlm} can take a number of optional parameters as name-value pairs. -## -## @code{[@dots{}] = fitlm (..., "CategoricalVars", @var{categorical})} +## @var{X} is an @math{n x p} numeric matrix of predictor values. +## @var{y} is an @math{n x 1} numeric response vector. +## @var{tbl} is a @code{table} object containing all variables. ## +## @var{modelspec} can be: ## @itemize -## @item -## @var{categorical} is a vector of indices indicating which of the columns -## (i.e. variables) in @var{X} should be treated as categorical predictors -## rather than as continuous predictors. +## @item @qcode{'linear'} (default) — intercept + all linear terms +## @item @qcode{'constant'} — intercept only +## @item @qcode{'interactions'} — linear + all pairwise interactions +## @item @qcode{'purequadratic'} — linear + all squared terms +## @item @qcode{'quadratic'} — linear + interactions + squared terms +## @item A Wilkinson formula string, e.g. @qcode{'y ~ x1 + x2 + x1:x2'} ## @end itemize ## -## @qcode{fitlm} also accepts optional @qcode{anovan} parameters as name-value -## pairs (except for the "model" parameter). 
The accepted parameter names from -## @qcode{anovan} and their default values in @qcode{fitlm} are: -## -## @itemize -## @item -## @var{CONTRASTS} : "treatment" -## -## @item -## @var{SSTYPE}: 2 -## -## @item -## @var{ALPHA}: 0.05 -## -## @item -## @var{DISPLAY}: "on" +## @strong{Name-Value Arguments} ## -## @item -## @var{WEIGHTS}: [] (empty) -## -## @item -## @var{RANDOM}: [] (empty) -## -## @item -## @var{CONTINUOUS}: [1:N] -## -## @item -## @var{VARNAMES}: [] (empty) -## @end itemize +## @table @asis +## @item @qcode{'Weights'} — n×1 double, per-observation weights +## @item @qcode{'Exclude'} — integer indices or logical vector of rows to exclude +## @item @qcode{'Intercept'} — logical (default: @code{true}) +## @item @qcode{'CategoricalVars'} — integer indices or cell of names +## @item @qcode{'VarNames'} — cell of char, override variable names (matrix input) +## @item @qcode{'ResponseVar'} — char, override response variable (table input) +## @item @qcode{'PredictorVars'} — cell or indices, specify predictor subset (table input) +## @end table ## -## Type '@qcode{help anovan}' to find out more about what these options do. +## @strong{Output} ## -## @qcode{fitlm} can return up to two output arguments: +## @var{mdl} is a @code{LinearModel} object with all properties populated. ## -## [@var{tab}] = fitlm (@dots{}) returns a cell array containing a -## table of model parameters -## -## [@var{tab}, @var{stats}] = fitlm (@dots{}) returns a structure -## containing additional statistics, including degrees of freedom and effect -## sizes for each term in the linear model, the design matrix, the -## variance-covariance matrix, (weighted) model residuals, and the mean squared -## error. The columns of @var{stats}.coeffs (from left-to-right) report the -## model coefficients, standard errors, lower and upper 100*(1-alpha)% -## confidence interval bounds, t-statistics, and p-values relating to the -## contrasts. 
The number appended to each term name in @var{stats}.coeffnames -## corresponds to the column number in the relevant contrast matrix for that -## factor. The @var{stats} structure can be used as input for @qcode{multcompare}. -## Note that if the model contains a continuous variable and you wish to use -## the @var{STATS} output as input to @qcode{multcompare}, then the model needs -## to be refit with the "contrast" parameter set to a sum-to-zero contrast -## coding scheme, e.g."simple". -## -## @seealso{anovan, multcompare} +## @seealso{LinearModel, CompactLinearModel, stepwiselm} ## @end deftypefn -function [T, STATS] = fitlm (X, y, varargin) +function mdl = fitlm (varargin) - ## Check input and output arguments - if (nargin < 2) - error (strcat ("fitlm usage: ""fitlm (X, y, varargin)""; ", ... - " atleast 2 input arguments required")); - endif - if (nargout > 3) - error ("fitlm: invalid number of output arguments requested"); - endif + if (nargin < 1) + print_usage (); + endif - ## Evaluate input data - [n, N] = size (X); - msg = strcat ("fitlm: do not include the intercept column", ... - " in X - it will be added automatically"); - if (iscell (X)) - if (~ iscell (X{:,1})) - if (all (X{:,1} == 1)) - error (msg) - endif - endif - else - if (all (X(:,1) == 1)) - error (msg) - endif - endif + ## ── [Step 1] Parse inputs ──────────────────────────────────────────────── + [raw_X, raw_y, var_names, cat_flags, weights, excl_mask, ... + modelspec, obs_names, resp_name, pred_names, has_intercept, raw_cols, ... + dummy_coding] = ... + __fitlm_parse_inputs__ (varargin{:}); - ## Fetch anovan options - options = varargin; - if (isempty(options)) - options{1} = "linear"; - endif + n = rows (raw_y); + p = numel (pred_names); - ## Check if MODELSPEC was provided. If not create it. - if (ischar (options{1})) - if (! ismember (lower (options{1}), {"sstype", "varnames", "contrasts", ... - "weights", "alpha", "display", "continuous", ... 
- "categorical", "categoricalvars", "random", "model"})) - MODELSPEC = options{1}; - options(1) = []; - CONTINUOUS = []; - else - ## If MODELSPEC is not provided, set it for an additive linear model - MODELSPEC = zeros (N + 1); - MODELSPEC(2:N+1, 1:N) = eye (N); - end - else - MODELSPEC = options{1}; - options(1) = []; - endif + ## ── [Step 2] Build design matrix ───────────────────────────────────────── + [terms_mat, coef_names, X_full, y_full, incl_mask, p_tot, has_intercept, lp_str] = ... + __fitlm_build_design__ (raw_X, raw_y, modelspec, var_names, cat_flags, ... + weights, excl_mask, pred_names, raw_cols, ... + has_intercept, dummy_coding); - ## Evaluate MODELSPEC - if (ischar (MODELSPEC)) - MODELSPEC = lower (MODELSPEC); - if (! isempty (regexp (MODELSPEC, "~"))) - error ("fitlm: model formulae are not a supported format for MODELSPEC") - endif - if (! ismember (MODELSPEC, {"constant", "linear", "interaction", ... - "interactions", "full"})) - error ("fitlm: character vector for model specification not recognised") - endif - if strcmp (MODELSPEC, "constant") - X = []; - MODELSPEC = "linear"; - N = 0; - endif - else - if (size (MODELSPEC, 1) < N + 1) - error ("fitlm: number of rows in MODELSPEC must 1 + number of columns in X"); - endif - if (size (MODELSPEC, 2) != N + 1) - error ("fitlm: number of columns in MODELSPEC must = 1 + number of columns in X"); - endif - if (! all (ismember (MODELSPEC(:), [0,1]))) - error (strcat ("fitlm: elements of the model terms matrix must be ", ... - " either 0 or 1. Higher order terms are not supported")); - endif - MODELSPEC = logical (MODELSPEC(2:N+1,1:N)); - endif + ## ── [Step 3] Call math engine on included rows only ────────────────────── + X_fit = X_full(incl_mask, :); + y_fit = y_full(incl_mask); + w_fit = weights(incl_mask); - ## Check for unsupported options used by anovan - if (any (strcmpi ("MODEL", options))) - error (strcat("fitlm: modelspec should be specified in the", ... 
- " third input argument of fitlm (if at all)")); - endif + warning ("off", "CompactLinearModel:rankDeficient"); + s = lm_fit_engine (X_fit, y_fit, w_fit); + warning ("on", "CompactLinearModel:rankDeficient"); - ## Check and set variable types - idx = find (any (cat (1, strcmpi ("categorical", options), ... - strcmpi ("categoricalvars", options)))); - if (! isempty (idx)) - CONTINUOUS = [1:N]; - CONTINUOUS(ismember(CONTINUOUS,options{idx+1})) = []; - options(idx:idx+1) = []; - else - CONTINUOUS = [1:N]; - endif - idx = find (strcmpi ("continuous", options)); - if (! isempty (idx)) - ## Note that setting continuous parameter will override settings made - ## to "categorical" - CONTINUOUS = options{idx+1}; - endif + n_used = s.n; - ## Check if anovan CONTRASTS option was used - idx = find (strcmpi ("contrasts", options)); - if (isempty (idx)) - CONTRASTS = "treatment"; - else - CONTRASTS = options{idx+1}; - if (ischar(CONTRASTS)) - contr_str = CONTRASTS; - CONTRASTS = cell (1, N); - CONTRASTS(:) = {contr_str}; - endif - if (! iscell (CONTRASTS)) - CONTRASTS = {CONTRASTS}; - endif - for i = 1:N - if (! isnumeric(CONTRASTS{i})) - if (! isempty (CONTRASTS{i})) - if (! ismember (CONTRASTS{i}, ... - {"simple","poly","helmert","effect","treatment"})) - error (strcat("fitlm: the choices for built-in contrasts are", ... - " 'simple', 'poly', 'helmert', 'effect', or 'treatment'")); - endif - endif - endif - endfor - endif + ## ── [Step 4] Build Formula struct ──────────────────────────────────────── + ## fromFit expects formula.CoefficientNames to be set + Formula = __fitlm_build_formula__ (terms_mat, var_names, pred_names, ... 
+ resp_name, has_intercept, lp_str, coef_names); + Formula.CoefficientNames = coef_names; ## required by fromFit (line 288) - ## Check if anovan SSTYPE option was used - idx = find (strcmpi ("sstype", options)); - if (isempty (idx)) - SSTYPE = 2; - else - SSTYPE = options{idx+1}; - endif + ## ── [Step 5] Build VariableInfo struct ──────────────────────────────────── + VariableInfo = __fitlm_build_varinfo__ (var_names, cat_flags, raw_cols, ... + Formula.InModel); + + ## ── [Step 6] Build ObservationInfo ─────────────────────────────────────── + ObservationInfo = __fitlm_build_obsinfo__ (weights, excl_mask, raw_X, raw_y, obs_names); + + ## ── [Step 7] Build Variables struct ────────────────────────────────────── + Variables = __fitlm_build_variables_table__ (raw_cols, var_names, obs_names); - ## Perform model fit and ANOVA - [~, ~, STATS] = anovan (y, X, options{:}, ... - "model", MODELSPEC, ... - "contrasts", CONTRASTS, ... - "continuous", CONTINUOUS, ... - "sstype", SSTYPE); + ## ── [Step 8] Compute missing mask ──────────────────────────────────────── + missing_mask = ObservationInfo.Missing; - ## Create table of regression coefficients - ncoeff = sum (STATS.df); - T = cell (2 + ncoeff, 7); - T(1,:) = {"Parameter", "Estimate", "SE", ... - "Lower.CI", "Upper.CI", "t", "Prob>|t|"}; - T(2:end,1) = STATS.coeffnames; - T(2:end,2:7) = num2cell (STATS.coeffs); + ## ── [Step 9] Call LinearModel.fromFit ──────────────────────────────────── + ## + ## fromFit signature: + ## fromFit(fit_s, X, y, formula, pred_names, resp_name, + ## var_names, var_info, obs_names, obs_info, + ## weights, excluded, missing_mask, steps, variables_data) + ## + ## X and y here are the USED (included) subsets — fromFit computes diagnostics + mdl = LinearModel.fromFit ( ... + s, ... ## lm_fit_engine output + X_fit, ... ## design matrix (used rows only) + y_fit, ... ## response (used rows only) + Formula, ... ## Formula struct (with CoefficientNames field) + pred_names, ... 
## predictor variable names + resp_name, ... ## response variable name + var_names, ... ## all variable names + VariableInfo, ... ## VariableInfo struct + obs_names, ... ## ObservationNames + ObservationInfo, ... ## ObservationInfo struct + weights, ... ## full n weights + excl_mask, ... ## full n exclusion mask + missing_mask, ... ## full n missing mask + [], ... ## Steps = [] for fitlm + Variables, ... ## Variables struct + X_full); ## full n design matrix for excluded-row Fitted + +endfunction - ## Update STATS structure - STATS.source = "fitlm"; +## ── Private helper — build Variables struct ─────────────────────────────────── + +function Vars = __fitlm_build_variables_table__ (raw_cols, var_names, obs_names) + ## Build Variables as a struct where each field = one column (all n rows). + Vars = struct (); + for k = 1:numel (var_names) + vn = var_names{k}; + ## Replace invalid fieldname chars + fn = regexprep (vn, '[^a-zA-Z0-9_]', '_'); + if (isempty (fn) || ! isempty (regexp (fn(1), '[^a-zA-Z_]'))) + fn = ['v_', fn]; + endif + Vars.(fn) = raw_cols{k}; + endfor endfunction -%!demo -%! y = [ 8.706 10.362 11.552 6.941 10.983 10.092 6.421 14.943 15.931 ... -%! 22.968 18.590 16.567 15.944 21.637 14.492 17.965 18.851 22.891 ... -%! 22.028 16.884 17.252 18.325 25.435 19.141 21.238 22.196 18.038 ... -%! 22.628 31.163 26.053 24.419 32.145 28.966 30.207 29.142 33.212 ... -%! 25.694 ]'; -%! X = [1 1 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5]'; -%! -%! [TAB,STATS] = fitlm (X,y,"linear","CategoricalVars",1,"display","on"); - -%!demo -%! popcorn = [5.5, 4.5, 3.5; 5.5, 4.5, 4.0; 6.0, 4.0, 3.0; ... -%! 6.5, 5.0, 4.0; 7.0, 5.5, 5.0; 7.0, 5.0, 4.5]; -%! brands = {'Gourmet', 'National', 'Generic'; ... -%! 'Gourmet', 'National', 'Generic'; ... -%! 'Gourmet', 'National', 'Generic'; ... -%! 'Gourmet', 'National', 'Generic'; ... -%! 'Gourmet', 'National', 'Generic'; ... -%! 'Gourmet', 'National', 'Generic'}; -%! 
popper = {'oil', 'oil', 'oil'; 'oil', 'oil', 'oil'; 'oil', 'oil', 'oil'; ... -%! 'air', 'air', 'air'; 'air', 'air', 'air'; 'air', 'air', 'air'}; -%! -%! [TAB, STATS] = fitlm ({brands(:),popper(:)},popcorn(:),"interactions",... -%! "CategoricalVars",[1,2],"display","on"); +## ─── UNIT TESTS ─────────────────────────────────────────────────────────────── + +## Test 1: basic matrix input returns LinearModel +%!test +%! rng (42); +%! X = randn (20, 3); +%! y = X * [1; 2; -1] + randn (20, 1) * 0.5; +%! mdl = fitlm (X, y); +%! assert (isa (mdl, 'LinearModel')); +%! assert (mdl.NumPredictors, 3); +%! assert (mdl.NumObservations, 20); +%! assert (mdl.NumCoefficients, 4); +%! assert (isempty (mdl.Steps)); +%! assert (mdl.DFE, 16); + +## Test 2: VariableNames and ResponseName for matrix input +%!test +%! rng (1); +%! X = randn (10, 2); +%! y = X * [2; -1] + randn (10, 1) * 0.3; +%! mdl = fitlm (X, y); +%! assert (isequal (mdl.VariableNames, {'x1', 'x2', 'y'})); +%! assert (strcmp (mdl.ResponseName, 'y')); +%! assert (isequal (mdl.PredictorNames, {'x1', 'x2'})); +%! assert (mdl.NumVariables, 3); + +## Test 3: VarNames override +%!test +%! rng (1); +%! X = randn (15, 2); +%! y = X * [3; -1] + randn (15, 1) * 0.5; +%! mdl = fitlm (X, y, 'VarNames', {'Height', 'Weight', 'BP'}); +%! assert (strcmp (mdl.ResponseName, 'BP')); +%! assert (isequal (mdl.PredictorNames, {'Height', 'Weight'})); +%! assert (isequal (mdl.CoefficientNames, {'(Intercept)', 'Height', 'Weight'})); + +## Test 5: modelspec 'constant' +%!test +%! rng (1); +%! X = randn (10, 3); +%! y = randn (10, 1); +%! mdl = fitlm (X, y, 'constant'); +%! assert (mdl.NumCoefficients, 1); +%! assert (isequal (mdl.CoefficientNames, {'(Intercept)'})); + +## Test 6: modelspec 'interactions' +%!test +%! rng (1); +%! X = randn (15, 2); +%! y = randn (15, 1); +%! mdl = fitlm (X, y, 'interactions'); +%! assert (mdl.NumCoefficients, 4); ## 1 + x1 + x2 + x1:x2 +%! 
assert (any (strcmp (mdl.CoefficientNames, 'x1:x2'))); + +## Test 7: Intercept=false +%!test +%! rng (42); +%! X = randn (20, 2); +%! y = X * [3; -1] + randn (20, 1) * 0.3; +%! mdl = fitlm (X, y, 'Intercept', false); +%! assert (mdl.NumCoefficients, 2); +%! assert (mdl.DFE, 18); +%! assert (mdl.Formula.HasIntercept, false); +%! assert (! any (strcmp (mdl.CoefficientNames, '(Intercept)'))); + +## Test 8: Exclude integer indices +%!test +%! rng (1); +%! X = randn (10, 2); +%! y = X * [2; -1] + randn (10, 1) * 0.3; +%! mdl = fitlm (X, y, 'Exclude', [3, 7]); +%! assert (mdl.NumObservations, 8); +%! assert (mdl.ObservationInfo.Excluded(3), true); +%! assert (mdl.ObservationInfo.Excluded(7), true); +%! assert (mdl.ObservationInfo.Subset(3), false); +%! assert (isnan (mdl.Residuals.Raw(3))); +%! assert (isnan (mdl.Residuals.Raw(7))); + +## Test 9: Exclude logical vector gives same coefficients as integer +%!test +%! rng (1); +%! X = randn (10, 2); +%! y = X * [2; -1] + randn (10, 1) * 0.3; +%! mdl_int = fitlm (X, y, 'Exclude', [3, 7]); +%! excl_log = false (10, 1); excl_log([3,7]) = true; +%! mdl_log = fitlm (X, y, 'Exclude', excl_log); +%! assert (mdl_int.Coefficients.Estimate, mdl_log.Coefficients.Estimate, 1e-12); + +## Test 10: NaN in X → Missing flag, Fitted = NaN +%!test +%! rng (1); +%! X = randn (10, 2); +%! y = X * [2; -1] + randn (10, 1) * 0.3; +%! X(5, 1) = NaN; +%! mdl = fitlm (X, y); +%! assert (mdl.NumObservations, 9); +%! assert (mdl.ObservationInfo.Missing(5), true); +%! assert (isnan (mdl.Fitted(5))); +%! assert (isnan (mdl.Residuals.Raw(5))); + +## Test 11: Weighted regression +%!test +%! rng (42); +%! X = randn (15, 2); +%! y = X * [3; -1] + randn (15, 1) * 0.5; +%! w = abs (randn (15, 1)) + 0.5; +%! mdl_w = fitlm (X, y, 'Weights', w); +%! mdl_u = fitlm (X, y); +%! assert (mdl_w.DFE, mdl_u.DFE); +%! assert (mdl_w.NumObservations, 15); +%! assert (mdl_w.ObservationInfo.Weights, w, 1e-14); + +## Test 12: CategoricalVars on numeric matrix +%!test +%! 
rng (42); +%! X = [randn(30, 1), repmat([1;2;3], 10, 1), randn(30, 1)]; +%! y = X(:,1)*2 + (X(:,2)==2)*1.5 + randn(30,1)*0.3; +%! mdl = fitlm (X, y, 'CategoricalVars', 2); +%! assert (mdl.VariableInfo.IsCategorical(2), true); +%! assert (mdl.NumCoefficients, 5); ## intercept + x1 + x2_2 + x2_3 + x3 +%! assert (mdl.NumPredictors, 3); +%! assert (any (strcmp (mdl.CoefficientNames, 'x2_2'))); +%! assert (any (strcmp (mdl.CoefficientNames, 'x2_3'))); + +## Test 13: compact() preserves key fields +%!test +%! rng (1); +%! X = randn (15, 2); +%! y = X * [2; -1] + randn (15, 1) * 0.3; +%! mdl = fitlm (X, y); +%! cmdl = compact (mdl); +%! assert (isa (cmdl, 'CompactLinearModel')); +%! assert (! isa (cmdl, 'LinearModel')); +%! assert (cmdl.MSE, mdl.MSE, 1e-10); +%! assert (cmdl.RMSE, mdl.RMSE, 1e-10); +%! assert (cmdl.DFE, mdl.DFE); + +## Test 14: rank-deficient — NumEstimatedCoefficients < NumCoefficients +%!test +%! warning ('off', 'CompactLinearModel:rankDeficient'); +%! rng (1); +%! X_base = randn (20, 2); +%! X_rd = [X_base, X_base(:,1) + X_base(:,2)]; +%! y = X_base * [2; -1] + randn (20, 1) * 0.3; +%! mdl = fitlm (X_rd, y); +%! assert (mdl.NumCoefficients, 4); +%! assert (mdl.NumEstimatedCoefficients, 3); +%! warning ('on', 'CompactLinearModel:rankDeficient'); + +## Test 15: Steps is [] for fitlm models %!test -%! y = [ 8.706 10.362 11.552 6.941 10.983 10.092 6.421 14.943 15.931 ... -%! 22.968 18.590 16.567 15.944 21.637 14.492 17.965 18.851 22.891 ... -%! 22.028 16.884 17.252 18.325 25.435 19.141 21.238 22.196 18.038 ... -%! 22.628 31.163 26.053 24.419 32.145 28.966 30.207 29.142 33.212 ... -%! 25.694 ]'; -%! X = [1 1 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5]'; -%! [TAB,STATS] = fitlm (X,y,"continuous",[],"display","off"); -%! [TAB,STATS] = fitlm (X,y,"CategoricalVars",1,"display","off"); -%! [TAB,STATS] = fitlm (X,y,"constant","categorical",1,"display","off"); -%! [TAB,STATS] = fitlm (X,y,"linear","categorical",1,"display","off"); -%! 
[TAB,STATS] = fitlm (X,y,[0,0;1,0],"categorical",1,"display","off"); -%! assert (TAB{2,2}, 10, 1e-04); -%! assert (TAB{3,2}, 7.99999999999999, 1e-09); -%! assert (TAB{4,2}, 8.99999999999999, 1e-09); -%! assert (TAB{5,2}, 11.0001428571429, 1e-09); -%! assert (TAB{6,2}, 19.0001111111111, 1e-09); -%! assert (TAB{2,3}, 1.01775379540949, 1e-09); -%! assert (TAB{3,3}, 1.64107868458008, 1e-09); -%! assert (TAB{4,3}, 1.43932122062479, 1e-09); -%! assert (TAB{5,3}, 1.48983900477565, 1e-09); -%! assert (TAB{6,3}, 1.3987687997822, 1e-09); -%! assert (TAB{2,6}, 9.82555903510687, 1e-09); -%! assert (TAB{3,6}, 4.87484242844031, 1e-09); -%! assert (TAB{4,6}, 6.25294748040552, 1e-09); -%! assert (TAB{5,6}, 7.38344399756088, 1e-09); -%! assert (TAB{6,6}, 13.5834536158296, 1e-09); -%! assert (TAB{3,7}, 2.85812420217862e-05, 1e-12); -%! assert (TAB{4,7}, 5.22936741204002e-07, 1e-06); -%! assert (TAB{5,7}, 2.12794763209106e-08, 1e-07); -%! assert (TAB{6,7}, 7.82091664406755e-15, 1e-08); +%! rng (1); +%! X = randn (10, 2); +%! y = randn (10, 1); +%! mdl = fitlm (X, y); +%! assert (isempty (mdl.Steps)); +## Test 17: Wilkinson formula string input %!test -%! popcorn = [5.5, 4.5, 3.5; 5.5, 4.5, 4.0; 6.0, 4.0, 3.0; ... -%! 6.5, 5.0, 4.0; 7.0, 5.5, 5.0; 7.0, 5.0, 4.5]; -%! brands = bsxfun (@times, ones(6,1), [1,2,3]); -%! popper = bsxfun (@times, [1;1;1;2;2;2], ones(1,3)); -%! -%! [TAB, STATS] = fitlm ({brands(:),popper(:)},popcorn(:),"interactions",... -%! "categoricalvars",[1,2],"display","off"); -%! assert (TAB{2,2}, 5.66666666666667, 1e-09); -%! assert (TAB{3,2}, -1.33333333333333, 1e-09); -%! assert (TAB{4,2}, -2.16666666666667, 1e-09); -%! assert (TAB{5,2}, 1.16666666666667, 1e-09); -%! assert (TAB{6,2}, -0.333333333333334, 1e-09); -%! assert (TAB{7,2}, -0.166666666666667, 1e-09); -%! assert (TAB{2,3}, 0.215165741455965, 1e-09); -%! assert (TAB{3,3}, 0.304290309725089, 1e-09); -%! assert (TAB{4,3}, 0.304290309725089, 1e-09); -%! assert (TAB{5,3}, 0.304290309725089, 1e-09); -%! 
assert (TAB{6,3}, 0.43033148291193, 1e-09); -%! assert (TAB{7,3}, 0.43033148291193, 1e-09); -%! assert (TAB{2,6}, 26.3362867542108, 1e-09); -%! assert (TAB{3,6}, -4.38178046004138, 1e-09); -%! assert (TAB{4,6}, -7.12039324756724, 1e-09); -%! assert (TAB{5,6}, 3.83405790253621, 1e-09); -%! assert (TAB{6,6}, -0.774596669241495, 1e-09); -%! assert (TAB{7,6}, -0.387298334620748, 1e-09); -%! assert (TAB{2,7}, 5.49841502258254e-12, 1e-09); -%! assert (TAB{3,7}, 0.000893505495903642, 1e-09); -%! assert (TAB{4,7}, 1.21291454302428e-05, 1e-09); -%! assert (TAB{5,7}, 0.00237798044119407, 1e-09); -%! assert (TAB{6,7}, 0.453570536021938, 1e-09); -%! assert (TAB{7,7}, 0.705316781644046, 1e-09); -%! ## Test with string ids for categorical variables -%! brands = {'Gourmet', 'National', 'Generic'; ... -%! 'Gourmet', 'National', 'Generic'; ... -%! 'Gourmet', 'National', 'Generic'; ... -%! 'Gourmet', 'National', 'Generic'; ... -%! 'Gourmet', 'National', 'Generic'; ... -%! 'Gourmet', 'National', 'Generic'}; -%! popper = {'oil', 'oil', 'oil'; 'oil', 'oil', 'oil'; 'oil', 'oil', 'oil'; ... -%! 'air', 'air', 'air'; 'air', 'air', 'air'; 'air', 'air', 'air'}; -%! [TAB, STATS] = fitlm ({brands(:),popper(:)},popcorn(:),"interactions",... -%! "categoricalvars",[1,2],"display","off"); +%! rng (1); +%! X = randn (20, 2); +%! y = X * [1; 2] + randn (20, 1) * 0.5; +%! mdl = fitlm (X, y, 'y ~ x1 + x2'); +%! assert (isa (mdl, 'LinearModel')); +%! assert (mdl.Formula.HasIntercept, true); +%! assert (strcmp (mdl.Formula.ResponseName, 'y')); +## Test 18: anova() still works on fitlm output %!test -%! load carsmall -%! X = [Weight,Horsepower,Acceleration]; -%! [TAB, STATS] = fitlm (X, MPG,"constant","display","off"); -%! [TAB, STATS] = fitlm (X, MPG,"linear","display","off"); -%! assert (TAB{2,2}, 47.9767628118615, 1e-09); -%! assert (TAB{3,2}, -0.00654155878851796, 1e-09); -%! assert (TAB{4,2}, -0.0429433065881864, 1e-09); -%! assert (TAB{5,2}, -0.0115826516894871, 1e-09); -%! 
assert (TAB{2,3}, 3.87851641748551, 1e-09); -%! assert (TAB{3,3}, 0.00112741016370336, 1e-09); -%! assert (TAB{4,3}, 0.0243130608813806, 1e-09); -%! assert (TAB{5,3}, 0.193325043113178, 1e-09); -%! assert (TAB{2,6}, 12.369874881944, 1e-09); -%! assert (TAB{3,6}, -5.80228828790225, 1e-09); -%! assert (TAB{4,6}, -1.76626492228599, 1e-09); -%! assert (TAB{5,6}, -0.0599128364487485, 1e-09); -%! assert (TAB{2,7}, 4.89570341688996e-21, 1e-09); -%! assert (TAB{3,7}, 9.87424814144e-08, 1e-09); -%! assert (TAB{4,7}, 0.0807803098213114, 1e-09); -%! assert (TAB{5,7}, 0.952359384151778, 1e-09); +%! rng (1); +%! X = randn (20, 2); +%! y = X * [1; 2] + randn (20, 1) * 0.5; +%! mdl = fitlm (X, y); +%! T = anova (mdl); +%! assert (isstruct (T)); diff --git a/inst/grp2idx.m b/inst/grp2idx.m index ae7ed87b..ed376491 100644 --- a/inst/grp2idx.m +++ b/inst/grp2idx.m @@ -1,5 +1,6 @@ ## Copyright (C) 2015 Carnë Draug ## Copyright (C) 2022-2026 Andreas Bertsatos +## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com> ## ## This file is part of the statistics package for GNU Octave. ## @@ -67,6 +68,20 @@ if (nargin != 1) print_usage (); endif + + ## Handle empty input immediately + if (isempty (s)) + if (! isvector (s) && ! ischar (s) && ! isstring (s) && ! iscellstr (s) && ! 
iscategorical (s)) + error ("grp2idx: Grouping variable must be a vector, character array, string array, or cell array of character vectors."); + endif + g = []; + gn = {}; + if (nargout > 2) + gl = []; + endif + return; + endif + if (ndims (s) != 2) error ("grp2idx: S must be either a vector or a matrix."); endif @@ -82,10 +97,36 @@ error ("grp2idx: 'categorical' grouping variable must be a vector."); endif is_categorical = true; - undef = isundefined (s); cats = categories (s); - s = cellstr (s); - s(undef) = {""}; + s_text = cellstr (s(:)); + g = NaN (numel (s_text), 1); + + if (nargout > 1) + if (isempty (cats)) + gn = cell (0, 1); + else + gn = cellstr (cats); + endif + endif + + if (nargout > 2) + if (isempty (cats)) + gl = categorical (cell (0, 1)); + else + gl = categorical (cellstr (cats), cellstr (cats), cellstr (cats)); + endif + endif + + if (isempty (cats)) + return; + endif + + cat_names = cellstr (cats); + undef = isundefined (s(:)); + for i = 1:numel (cat_names) + g(! undef & strcmp (s_text, cat_names{i})) = i; + endfor + return; elseif (ischar (s)) is_char_array = true; s = cellstr (s); @@ -121,16 +162,7 @@ [gl, I, g] = unique (s(:)); ## Fix order in here, since unique does not support this yet if (iscellstr (s) && ! is_categorical) - I = sort (I); - for i = 1:length (gl) - gl_s(i) = gl(g(I(i))); - idx(i,:) = (g == g(I(i))); - endfor - for i = 1:length (gl) - g(idx(i,:)) = i; - endfor - gl = gl_s; - gl = gl'; + [g, gl] = g2i_reindex_stable (g, gl, I); endif ## handle NaNs and empty strings @@ -208,6 +240,24 @@ endfunction +function [g, gl] = g2i_reindex_stable (g, gl, first_idx) + + first_idx = sort (first_idx(:)); + ng = numel (gl); + new_gl = cell (ng, 1); + new_g = g; + + for i = 1:ng + old_idx = g(first_idx(i)); + new_gl{i} = gl{old_idx}; + new_g(g == old_idx) = i; + endfor + + gl = new_gl; + g = new_g; + +endfunction + # test for one output argument %!test %! g = grp2idx ([3 2 1 2 3 1]); @@ -390,6 +440,13 @@ %! 
assert (iscategorical (gl), true); %! assert (cellstr (gl), gn); +%!test +%! s = categorical ({'a'; 'b'; 'a'}, {'b', 'a', 'c'}, {'b', 'a', 'c'}); +%! [g, gn, gl] = grp2idx (s); +%! assert (g, [2; 1; 2]); +%! assert (gn, {'b'; 'a'; 'c'}); +%! assert (cellstr (gl), gn); + ## test for duration arrays %!test %! g = gn = gl = []; diff --git a/inst/parseWilkinsonFormula.m b/inst/parseWilkinsonFormula.m index 703c260c..f0a3d6fe 100644 --- a/inst/parseWilkinsonFormula.m +++ b/inst/parseWilkinsonFormula.m @@ -1,5 +1,6 @@ ## Copyright (C) 2026 Andreas Bertsatos ## Copyright (C) 2026 Avanish Salunke +## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com> ## ## This file is part of the statistics package for GNU Octave. ## @@ -201,7 +202,7 @@ else lhs_vars = resolve_lhs_symbolic (lhs_str); endif - + ## build the required output. varargout{1} = run_equation_builder (lhs_vars, rhs_terms); @@ -519,14 +520,14 @@ args_str_parts = {}; for k = 1:length (node.args) arg_res = run_expander (node.args{k}, mode); - + if (! isempty (arg_res) && ! isempty (arg_res{1})) - args_str_parts{end+1} = arg_res{1}{1}; + args_str_parts{end+1} = arg_res{1}{1}; else args_str_parts{end+1} = ''; endif endfor - + full_term = sprintf ("%s(%s)", node.name, strjoin (args_str_parts, ',')); result = {{full_term}}; else @@ -774,11 +775,8 @@ endif endfunction - ## extract variables + ## extract variables — collect RHS predictors first, then LHS response all_vars = {}; - if (! isempty (lhs_term)) - all_vars = [all_vars, flatten_recursive(lhs_term)]; - endif cleaned_rhs = cell (length (rhs_terms), 1); for i = 1:length (rhs_terms) @@ -789,11 +787,26 @@ final_term_vars = [final_term_vars, parts]; endfor cleaned_rhs{i} = final_term_vars; - all_vars = [all_vars, final_term_vars]; + ## Add only the BASE variable name to all_vars (strip ^k suffix) + for j = 1:length (final_term_vars) + tv = final_term_vars{j}; + caret = strfind (tv, '^'); + if (! 
isempty (caret)) + all_vars{end+1} = tv(1:caret(1)-1); + else + all_vars{end+1} = tv; + endif + endfor endfor - all_vars = unique (all_vars); - ## Remove intercept marker from var list + ## Append LHS (response) variables after RHS predictors + if (! isempty (lhs_term)) + all_vars = [all_vars, flatten_recursive(lhs_term)]; + endif + + ## Stable dedup (preserves first-seen order) + [~, ui] = unique (all_vars, 'stable'); + all_vars = all_vars(sort (ui)); all_vars(strcmp (all_vars, '1')) = []; schema.VariableNames = all_vars; @@ -808,7 +821,7 @@ endif endif - ## Build terms matrix + ## Build terms matrix (stores exponents, not just binary 1) n_vars = length (all_vars); n_terms = length (cleaned_rhs); terms_mat = zeros (n_terms, n_vars); @@ -822,22 +835,46 @@ continue; endif - [found, idx] = ismember (vars_in_this_term, all_vars); - if (any (! found)) - error ("parseWilkinsonFormula: Unknown variable in term definition."); - endif - terms_mat(i, idx) = 1; + for jj = 1:length (vars_in_this_term) + tv = vars_in_this_term{jj}; + caret = strfind (tv, '^'); + if (! isempty (caret)) + base_name = tv(1:caret(1)-1); + exp_val = str2double (tv(caret(1)+1:end)); + else + base_name = tv; + exp_val = 1; + endif + col = find (strcmp (all_vars, base_name)); + if (isempty (col)) + error ("parseWilkinsonFormula: Unknown variable '%s' in term.", ... + base_name); + endif + terms_mat(i, col) = exp_val; + endfor endfor - ## sorting : order by order. 
- term_orders = sum (terms_mat, 2); - M = [term_orders, terms_mat]; + ## sorting: order by (1) number of active variables ascending, + ## (2) binary mask descending (x1 before x2 before x3), + ## (3) exponent values ascending (x before x^2) + term_orders = sum (terms_mat > 0, 2); + binary_mask = double (terms_mat > 0); + M = [term_orders, binary_mask, terms_mat]; - [~, unique_idx] = unique (M, 'rows'); + ## Create unique rows based on actual terms + [~, unique_idx] = unique ([term_orders, terms_mat], 'rows'); terms_mat = terms_mat (unique_idx, :); + cleaned_rhs = cleaned_rhs(unique_idx); + M = M (unique_idx, :); + + n_v = size (terms_mat, 2); + ## Direction: [asc on order, desc on binary mask, asc on exponents] + sort_dirs = [1, -(2:1+n_v), (2+n_v:1+2*n_v)]; - [~, sort_idx] = sortrows ([sum(terms_mat, 2), terms_mat]); + ## Sort using the direction vector + [~, sort_idx] = sortrows (M, sort_dirs); schema.Terms = terms_mat (sort_idx, :); + schema.TermTokens = cleaned_rhs(sort_idx); endfunction @@ -921,6 +958,19 @@ endif endfor + ## Reorder schema.VariableNames to match table column order (Fix B4) + tbl_col_order = data.Properties.VariableNames; + [~, pos] = ismember (schema.VariableNames, tbl_col_order); + pos(pos == 0) = length (tbl_col_order) + 1; + [~, reorder_idx] = sort (pos); + schema.VariableNames = schema.VariableNames(reorder_idx); + schema.Terms = schema.Terms(:, reorder_idx); + if (! isempty (schema.ResponseIdx)) + resp_name = req_vars{schema.ResponseIdx}; + schema.ResponseIdx = find (strcmp (schema.VariableNames, resp_name)); + endif + req_vars = schema.VariableNames; + ## Build Design Matrix X X = []; col_names = {}; @@ -930,6 +980,7 @@ has_intercept = ! 
isempty (intercept_row_idx); n_terms = size (schema.Terms, 1); + cleaned_rhs_terms = schema.TermTokens; for i = 1:n_terms term_row = schema.Terms(i, :); @@ -942,20 +993,53 @@ continue; endif + ## Sort vars_idx: numeric variables before categorical (Fix B5) + if (length (vars_idx) > 1) + is_cat = arrayfun (@(v) strcmp (var_info.(req_vars{v}).type, ... + 'categorical'), vars_idx); + [~, sort_order] = sort (is_cat); + vars_idx = vars_idx(sort_order); + endif + current_block = ones (n_rows, 1); current_names = {''}; + ## Resolve term tokens for polynomial support (Fix B3) + term_tokens = cleaned_rhs_terms{i}; + token_idx = 0; + for v = vars_idx - vname = req_vars{v}; + token_idx = token_idx + 1; + if (! isempty (term_tokens) && token_idx <= length (term_tokens)) + term_token = term_tokens{token_idx}; + else + term_token = req_vars{v}; + endif + + caret = strfind (term_token, '^'); + if (! isempty (caret)) + vname = term_token(1:caret(1)-1); + else + vname = req_vars{v}; + endif info = var_info.(vname); if (strcmp (info.type, 'numeric')) - current_block = current_block .* info.data; + ## Resolve polynomial exponent from term token + if (! isempty (caret)) + exp_val = str2double (term_token(caret(1)+1:end)); + col_data = info.data .^ exp_val; + disp_name = term_token; + else + col_data = info.data; + disp_name = vname; + endif + current_block = current_block .* col_data; for k = 1:length (current_names) if (isempty (current_names{k})) - current_names{k} = vname; + current_names{k} = disp_name; else - current_names{k} = [current_names{k}, ':', vname]; + current_names{k} = [current_names{k}, ':', disp_name]; endif endfor else @@ -1140,14 +1224,14 @@ ## process RHS rhs_tokens = run_lexer (rhs_str); [rhs_tree, ~] = run_parser (rhs_tokens); - + wrapper.type = 'OPERATOR'; wrapper.value = '~'; wrapper.left = []; wrapper.right = rhs_tree; - + expanded = run_expander (wrapper, mode); - + ## extract the terms. 
if (isstruct (expanded) && isfield (expanded, 'model')) rhs_terms = expanded.model; @@ -1164,22 +1248,22 @@ for i = 1:length (parts) p = strtrim (parts{i}); if (isempty (p)), continue; endif - + range_parts = strsplit (p, '-'); - + if (length (range_parts) == 2) s_str = strtrim (range_parts{1}); e_str = strtrim (range_parts{2}); - + [s_tok] = regexp (s_str, '^([a-zA-Z_]\w*)(\d+)$', 'tokens'); [e_tok] = regexp (e_str, '^([a-zA-Z_]\w*)(\d+)$', 'tokens'); - + if (! isempty (s_tok) && ! isempty (e_tok)) prefix = s_tok{1}{1}; s_num = str2double (s_tok{1}{2}); e_prefix = e_tok{1}{1}; e_num = str2double (e_tok{1}{2}); - + if (strcmp (prefix, e_prefix) && s_num <= e_num) for n = s_num:e_num vars{end+1} = sprintf ("%s%d", prefix, n); @@ -1203,7 +1287,7 @@ for i = 1:length (rhs_terms) t = rhs_terms{i}; if (isempty (t)) - term_strs{end+1} = ''; + term_strs{end+1} = ''; else if (length (t) == 1 && any (strfind (t{1}, "("))) term_strs{end+1} = t{1}; @@ -1229,13 +1313,13 @@ rhs_parts{end+1} = sprintf ("%s*%s", coeff, t_str); endif endfor - + full_rhs = strjoin (rhs_parts, ' + '); if (isempty (full_rhs)), full_rhs = '0'; endif lines{end+1} = sprintf ("%s = %s", lhs_vars{k}, full_rhs); endfor - eq_list = string (lines'); + eq_list = string (lines'); endfunction %!demo @@ -1271,7 +1355,7 @@ %!demo %! -%! ## Interaction Effects : +%! ## Interaction Effects : %! ## We analyze Relief Score based on Drug Type and Dosage Level. %! ## The '*' operator expands to the main effects PLUS the interaction term. %! ## Categorical variables are automatically created. @@ -1287,11 +1371,11 @@ %!demo %! -%! ## Polynomial Regression : +%! ## Polynomial Regression : %! ## Uses the power operator (^) to model non-linear relationships. %! Distance = [20; 45; 80; 125]; %! Speed = [30; 50; 70; 90]; -%! Speed_2 = Speed .^ 2; +%! Speed_2 = Speed .^ 2; %! t = table (Distance, Speed, Speed_2, 'VariableNames', {'Distance', 'Speed', 'Speed^2'}); %! %! 
formula = 'Distance ~ Speed^2'; @@ -1316,7 +1400,7 @@ %!demo %! -%! ## Explicit Nesting : +%! ## Explicit Nesting : %! ## The parser also supports the explicit 'B(A)' syntax, which means %! ## 'B is nested within A'. This is equivalent to the interaction 'A:B' %! ## but often used to denote random effects or specific hierarchy. @@ -1327,7 +1411,7 @@ %!demo %! -%! ## Excluding Terms : +%! ## Excluding Terms : %! ## Demonstrates building a complex model and then simplifying it. %! ## We define a full 3-way interaction (A*B*C) but explicitly remove the %! ## three-way term (A:B:C) using the minus operator. @@ -1338,7 +1422,7 @@ %!demo %! -%! ## Repeated Measures : +%! ## Repeated Measures : %! ## This allows predicting multiple outcomes simultaneously. %! ## The range operator '-' selects all variables between 'T1' and 'T3' %! ## as the response matrix Y. @@ -1487,7 +1571,7 @@ %! C = {'lo'; 'hi'}; %! d = table (y, N, C); %! [M, ~, names] = parseWilkinsonFormula ('~ N * C', 'model_matrix', d); -%! assert (any (strcmp (names, 'C_lo:N'))); +%! assert (any (strcmp (names, 'N:C_lo'))); %!test %! ## Test : Intercept Only Model %! y = [1; 2; 3]; @@ -1659,6 +1743,48 @@ %! eq = parseWilkinsonFormula ('y ~ A - A', 'equation'); %! expected = string('y = c1'); %! assert (isequal (eq, expected)); +%!test +%! ## Verify parseWilkinsonFormula schema matches MATLAB fitlm sorting +%! formula = 'Y ~ x1 * x2 * x3'; +%! schema = parseWilkinsonFormula(formula, 'matrix'); +%! +%! ## Columns: predictors first in formula order, response last +%! expected_terms = [0, 0, 0, 0; ## (Intercept) +%! 1, 0, 0, 0; ## x1 +%! 0, 1, 0, 0; ## x2 +%! 0, 0, 1, 0; ## x3 +%! 1, 1, 0, 0; ## x1:x2 +%! 1, 0, 1, 0; ## x1:x3 +%! 0, 1, 1, 0; ## x2:x3 +%! 1, 1, 1, 0]; ## x1:x2:x3 +%! +%! assert(schema.VariableNames, {'x1', 'x2', 'x3', 'Y'}); +%! assert(schema.Terms, expected_terms); +%!test +%! ## polynomial x^2 produces base var with exponent in terms matrix +%! s = parseWilkinsonFormula ('y ~ x^2', 'matrix'); +%! 
assert (s.VariableNames, {'x', 'y'}); +%! assert (s.Terms, [0 0; 1 0; 2 0]); +%!test +%! ## polynomial model_matrix computes x.^2 column +%! x = (1:5)'; y = x .^ 2; +%! t = table (x, y); +%! [X, ~, names] = parseWilkinsonFormula ('y ~ x^2', 'model_matrix', t); +%! assert (names, {'(Intercept)'; 'x'; 'x^2'}); +%! assert (X(:,2), x); +%! assert (X(:,3), x .^ 2); +%!test +%! ## variable ordering follows table column order +%! Age = [25;30]; Weight = [70;75]; BP = [120;122]; +%! t = table (Age, Weight, BP); +%! [~, ~, n] = parseWilkinsonFormula ('BP ~ Age * Weight', 'model_matrix', t); +%! assert (n, {'(Intercept)'; 'Age'; 'Weight'; 'Age:Weight'}); +%!test +%! ## numeric columns named before categorical in interactions +%! N = [10;20]; C = {'lo';'hi'}; y = [1;2]; +%! d = table (N, C, y); +%! [~, ~, n] = parseWilkinsonFormula ('y ~ N * C', 'model_matrix', d); +%! assert (n, {'(Intercept)'; 'N'; 'C_lo'; 'N:C_lo'}); %!error parseWilkinsonFormula () %!error parseWilkinsonFormula ('y ~ x', 'invalid_mode') %!error parseWilkinsonFormula ('', 'parse') diff --git a/inst/private/__fitlm_build_design__.m b/inst/private/__fitlm_build_design__.m new file mode 100644 index 00000000..08610bed --- /dev/null +++ b/inst/private/__fitlm_build_design__.m @@ -0,0 +1,754 @@ +## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com> +## +## This file is part of the statistics package for GNU Octave. +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. 
+## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Private Function} {[@var{terms_mat}, @var{coef_names}, @var{X_full}, @ +## @var{y_full}, @var{incl_mask}, @var{p_tot}, @var{has_intercept}, @ +## @var{lp_str}] =} __fitlm_build_design__ (@var{raw_X}, @var{raw_y}, @ +## @var{modelspec}, @var{var_names}, @var{cat_flags}, @var{weights}, @ +## @var{excl_mask}, @var{pred_names}, @var{raw_cols}, @var{has_intercept}) +## +## Build design matrix and terms matrix from parsed fitlm inputs. +## +## Returns: +## terms_mat - (n_terms x n_all_vars) binary terms matrix (last col = response = 0) +## coef_names - 1 x p_tot cell of char coefficient names +## X_full - n x p_tot design matrix (ALL rows, NaN for missing rows) +## y_full - n x 1 response (NaN for missing rows) +## incl_mask - n x 1 logical: rows used in fit +## p_tot - total columns in design matrix +## has_intercept - logical +## lp_str - LinearPredictor RHS string +## @end deftypefn + +function [terms_mat, coef_names, X_full, y_full, incl_mask, p_tot, has_intercept, lp_str] = ... + __fitlm_build_design__ (raw_X, raw_y, modelspec, var_names, cat_flags, ... + weights, excl_mask, pred_names, raw_cols, ... 
+ has_intercept, dummy_coding) + + if (nargin < 11 || isempty (dummy_coding)) + dummy_coding = "reference"; + endif + + n = rows (raw_X); + p = columns (raw_X); ## number of predictor VARIABLES + n_vars = p + 1; ## predictors + response + + ## ── Step 1: Resolve shorthand → Terms matrix ──────────────────────────── + if (isnumeric (modelspec) && ismatrix (modelspec) && ndims (modelspec) == 2) + ## Numeric terms matrix passed directly (e.g., from stepwiselm) + terms_pred = modelspec; + ## If width = p+1 (includes response column), strip last column + if (columns (terms_pred) == n_vars) + terms_pred = terms_pred(:, 1:p); + endif + ## Detect intercept from the terms matrix + has_intercept = any (all (terms_pred == 0, 2)); + else + is_formula = ! isempty (strfind (modelspec, "~")); + + if (is_formula) + ## Formula string — parse response name and RHS + tilde = strfind (modelspec, "~"); + rhs = strtrim (modelspec(tilde(1)+1:end)); + ## Check for unsupported C() notation + if (! isempty (regexp (rhs, 'C\s*\('))) + error ("Unable to understand the character vector or string scalar '%s'.", modelspec); + endif + ## Check for intercept suppression + if (! isempty (regexp (rhs, '(^|\s|[+])\s*-\s*1\b'))) + has_intercept = false; + rhs = regexprep (rhs, '\s*[-]\s*1\b', ''); + rhs = strtrim (rhs); + endif + ## Parse the RHS terms + terms_pred = __parse_rhs_terms__ (rhs, pred_names, has_intercept); + else + ## Shorthand string + terms_pred = __shorthand_to_terms__ (lower (modelspec), p, has_intercept); + endif + endif + + + ## terms_pred: (n_terms x p) — predictor columns only + ## Expand to include response column (always 0) + terms_mat = [terms_pred, zeros(rows(terms_pred), 1)]; + + ## ── Step 2: Build design matrix column by column ───────────────────────── + ## + ## We iterate over term rows. 
For each term: + ## - intercept row (all 0): add ones column + ## - linear/squared numeric: add X^power column + ## - categorical main effect: add k-1 dummy columns + ## - interaction: element-wise product of factor block columns + + ## Pre-compute per-predictor "base columns" (for numerics) and + ## "dummy block" (for categoricals) + base_cols = cell (1, p); ## each cell = n x something + base_cnames = cell (1, p); ## cell of cell of char + cat_levels = cell (1, p); ## sorted unique levels for each cat predictor + + for k = 1:p + if (cat_flags(k)) + ## Categorical predictor — build dummy columns (reference = first level) + col_raw = raw_cols{k}; + if (isa (col_raw, "categorical")) + levs = cellstr (categories (col_raw)); + [~, cat_idx] = ismember (cellstr (col_raw), levs); + elseif (iscellstr (col_raw) || isstring (col_raw)) + if (isstring (col_raw)) + col_raw = cellstr (col_raw); + endif + levs = unique (col_raw); + levs = sort (levs); + [~, ~, cat_idx] = unique (col_raw); + ## Re-sort so idx matches sorted levs + [levs2, ~, cat_idx] = unique (col_raw); + levs = levs2; + else + ## Numeric forced categorical + vals = raw_X(:, k); + levs_num = unique (vals(! 
isnan (vals))); + levs_num = sort (levs_num); + levs = arrayfun (@num2str, levs_num, "UniformOutput", false); + cat_idx = zeros (n, 1); + for li = 1:numel (levs_num) + cat_idx(vals == levs_num(li)) = li; + endfor + endif + cat_levels{k} = levs; + n_lev = numel (levs); + ## Dummy coding: 'reference' drops first level, 'full' keeps all + if (strcmp (dummy_coding, "full")) + n_dum = n_lev; + D = zeros (n, n_dum); + dum_names = cell (1, n_dum); + for L = 1:n_lev + D(:, L) = double (cat_idx == L); + if (isnumeric (levs)) + lev_str = num2str (levs(L)); + else + lev_str = levs{L}; + endif + dum_names{L} = sprintf ("%s_%s", pred_names{k}, lev_str); + endfor + else + ## Reference coding: drop first level + n_dum = n_lev - 1; + D = zeros (n, n_dum); + dum_names = cell (1, n_dum); + for L = 2:n_lev + D(:, L-1) = double (cat_idx == L); + if (isnumeric (levs)) + lev_str = num2str (levs(L)); + else + lev_str = levs{L}; + endif + dum_names{L-1} = sprintf ("%s_%s", pred_names{k}, lev_str); + endfor + endif + base_cols{k} = D; + base_cnames{k} = dum_names; + else + ## Numeric predictor — single column; power applied per-term + base_cols{k} = raw_X(:, k); + base_cnames{k} = {pred_names{k}}; + endif + endfor + + ## Now build X_full column by column from terms_pred + X_full = []; + coef_names = {}; + + for i = 1:rows (terms_pred) + row = terms_pred(i, :); + + if (all (row == 0)) + ## Intercept + X_full = [X_full, ones(n, 1)]; + coef_names{end+1} = "(Intercept)"; + continue; + endif + + ## Find active predictors in this term + active = find (row); + ## Start with a block of ones × empty name + blk = ones (n, 1); + blk_n = {""}; + + for ai = 1:numel (active) + v = active(ai); + pwr = row(v); + + if (cat_flags(v)) + ## Categorical: Cartesian product with dummy block + D = base_cols{v}; ## n x (n_lev-1) + dn = base_cnames{v}; ## 1 x (n_lev-1) cell + new_blk = []; + new_blk_n = {}; + for c1 = 1:columns (blk) + for c2 = 1:columns (D) + new_blk = [new_blk, blk(:, c1) .* D(:, c2)]; + n1 = 
blk_n{c1}; + n2 = dn{c2}; + if (isempty (n1)) + new_blk_n{end+1} = n2; + else + new_blk_n{end+1} = [n1, ":", n2]; + endif + endfor + endfor + blk = new_blk; + blk_n = new_blk_n; + else + ## Numeric: multiply by column^power + col = base_cols{v}; + if (pwr > 1) + col = col .^ pwr; + cname = sprintf ("%s^%d", pred_names{v}, pwr); + else + cname = pred_names{v}; + endif + blk = bsxfun (@times, blk, col); + for c1 = 1:numel (blk_n) + if (isempty (blk_n{c1})) + blk_n{c1} = cname; + else + blk_n{c1} = [blk_n{c1}, ":", cname]; + endif + endfor + endif + endfor + + X_full = [X_full, blk]; + coef_names = [coef_names, blk_n]; + endfor + + ## ── Step 3: Compute incl_mask ─────────────────────────────────────────── + ## Missing: any NaN in the NUMERIC data (raw_X numeric cols or raw_y) + nan_in_X = any (isnan (raw_X), 2); + nan_in_y = isnan (raw_y); + ## For categorical columns stored as codes, NaN means missing + missing_mask = nan_in_X | nan_in_y; + + incl_mask = (! excl_mask) & (! missing_mask); + y_full = raw_y; + + p_tot = columns (X_full); + + ## ── Step 4: Build LinearPredictor string ──────────────────────────────── + lp_str = __build_lp_str__ (terms_pred, pred_names, has_intercept); + +endfunction + + +## ── Shorthand → terms_pred (n_terms x p) ──────────────────────────────────── + +function terms = __shorthand_to_terms__ (spec, p, has_intercept) + ## Returns (n_terms x p) matrix (no response column yet) + switch (spec) + case "constant" + if (has_intercept) + terms = zeros (1, p); + else + terms = zeros (0, p); + endif + + case "linear" + if (has_intercept) + terms = [zeros(1, p); eye(p)]; + else + terms = eye (p); + endif + + case "interactions" + lin_terms = eye (p); + inter_rows = []; + for i = 1:p-1 + for j = i+1:p + r = zeros (1, p); + r(i) = 1; r(j) = 1; + inter_rows = [inter_rows; r]; + endfor + endfor + if (has_intercept) + terms = [zeros(1,p); lin_terms; inter_rows]; + else + terms = [lin_terms; inter_rows]; + endif + + case "purequadratic" + lin_terms = 
eye (p); + quad_rows = 2 * eye (p); + if (has_intercept) + terms = [zeros(1,p); lin_terms; quad_rows]; + else + terms = [lin_terms; quad_rows]; + endif + + case {"quadratic", "full"} + lin_terms = eye (p); + inter_rows = []; + for i = 1:p-1 + for j = i+1:p + r = zeros (1, p); + r(i) = 1; r(j) = 1; + inter_rows = [inter_rows; r]; + endfor + endfor + quad_rows = 2 * eye (p); + if (has_intercept) + terms = [zeros(1,p); lin_terms; inter_rows; quad_rows]; + else + terms = [lin_terms; inter_rows; quad_rows]; + endif + + otherwise + ## Check for polyNM pattern (e.g., 'poly22', 'poly33') + m = regexp (spec, '^poly(\d+)$', 'tokens'); + if (! isempty (m)) + digits_str = m{1}{1}; + if (numel (digits_str) != p) + error ("fitlm: 'poly%s' requires %d digits for %d predictors.", ... + digits_str, p, p); + endif + max_powers = arrayfun (@(c) str2double (c), digits_str); + terms = __poly_to_terms__ (max_powers, p, has_intercept); + else + ## Unknown — default to linear + if (has_intercept) + terms = [zeros(1, p); eye(p)]; + else + terms = eye (p); + endif + endif + endswitch +endfunction + + +## ── Formula RHS → terms_pred ───────────────────────────────────────────────── + +function terms = __parse_rhs_terms__ (rhs, pred_names, has_intercept) + ## Parse a Wilkinson RHS string with full operator support: + ## +, -, *, /, :, ^, (group)^N + ## Returns (n_terms x p) matrix + p = numel (pred_names); + + ## Split into positive and negative terms (respecting parentheses) + [add_toks, sub_toks, has_int_in_rhs, remove_int] = __split_additive__ (rhs); + + ## Expand each positive token into term rows + add_rows = {}; + for i = 1:numel (add_toks) + expanded = __expand_token__ (add_toks{i}, pred_names, p); + if (isempty (expanded)) + error ("The model formula contains names not recognized as predictor or response names."); + endif + add_rows = [add_rows, expanded]; + endfor + + ## Remove subtracted terms + for i = 1:numel (sub_toks) + expanded = __expand_token__ (sub_toks{i}, pred_names, p); + 
for j = 1:numel (expanded) + add_rows = __remove_row__ (add_rows, expanded{j}); + endfor + endfor + + ## Deduplicate + add_rows = __dedup_rows__ (add_rows); + + ## Sort by MATLAB canonical order: (total_degree, max_power, first_var_index) + if (numel (add_rows) > 1) + sort_keys = zeros (numel (add_rows), 3); + for i = 1:numel (add_rows) + row = add_rows{i}; + sort_keys(i, 1) = sum (row); ## total degree + sort_keys(i, 2) = max (row); ## max individual power + nz = find (row > 0); + if (! isempty (nz)) + sort_keys(i, 3) = nz(1); ## first nonzero variable index + endif + endfor + [~, ord] = sortrows (sort_keys); + add_rows = add_rows(ord); + endif + + ## Assemble terms matrix + n_terms = numel (add_rows); + terms_body = zeros (n_terms, p); + for i = 1:n_terms + terms_body(i, :) = add_rows{i}; + endfor + + ## Handle intercept + if (remove_int) + has_intercept = false; + endif + if (has_intercept || has_int_in_rhs) + terms = [zeros(1, p); terms_body]; + else + terms = terms_body; + endif +endfunction + + +## ── LinearPredictor string builder ─────────────────────────────────────────── + +function s = __build_lp_str__ (terms_pred, pred_names, has_intercept) + parts = {}; + for i = 1:rows (terms_pred) + row = terms_pred(i, :); + if (all (row == 0)) + parts{end+1} = "1"; + else + active = find (row); + tparts = {}; + for j = 1:numel (active) + v = active(j); + pwr = row(v); + if (pwr == 1) + tparts{end+1} = pred_names{v}; + else + tparts{end+1} = sprintf ("%s^%d", pred_names{v}, pwr); + endif + endfor + parts{end+1} = strjoin (tparts, ":"); + endif + endfor + s = strjoin (parts, " + "); +endfunction + + +## ── Split RHS on + and - (respecting parentheses) ─────────────────────────── + +function [add_toks, sub_toks, has_int, remove_int] = __split_additive__ (rhs) + add_toks = {}; + sub_toks = {}; + has_int = false; + remove_int = false; + rhs = strtrim (rhs); + if (isempty (rhs)) + return; + endif + depth = 0; + current = ''; + sign = 1; + for i = 1:numel (rhs) + ch = 
rhs(i); + if (ch == '(') + depth++; + current = [current, ch]; + elseif (ch == ')') + depth--; + current = [current, ch]; + elseif (depth == 0 && (ch == '+' || ch == '-')) + tok = strtrim (current); + if (! isempty (tok)) + if (strcmp (tok, '1')) + if (sign == -1), remove_int = true; else, has_int = true; endif + else + if (sign == -1), sub_toks{end+1} = tok; else, add_toks{end+1} = tok; endif + endif + endif + current = ''; + if (ch == '+'), sign = 1; else, sign = -1; endif + else + current = [current, ch]; + endif + endfor + tok = strtrim (current); + if (! isempty (tok)) + if (strcmp (tok, '1')) + if (sign == -1), remove_int = true; else, has_int = true; endif + else + if (sign == -1), sub_toks{end+1} = tok; else, add_toks{end+1} = tok; endif + endif + endif +endfunction + + +## ── Expand a single token into term rows ───────────────────────────────────── + +function rows = __expand_token__ (tok, pred_names, p) + tok = strtrim (tok); + rows = {}; + + ## Case 1: (group)^N + m = regexp (tok, '^\((.+)\)\^(\d+)$', 'tokens'); + if (! isempty (m)) + inner_str = m{1}{1}; + power = str2double (m{1}{2}); + inner_toks = strtrim (strsplit (inner_str, '+')); + inner_vars = []; + for i = 1:numel (inner_toks) + idx = find (strcmp (pred_names, strtrim (inner_toks{i}))); + if (! isempty (idx)) + inner_vars(end+1) = idx; + endif + endfor + if (! isempty (inner_vars)) + rows = __group_power_expand__ (inner_vars, power, p); + endif + return; + endif + + ## Case 2: bare (group) + m = regexp (tok, '^\((.+)\)$', 'tokens'); + if (! 
isempty (m)) + inner_str = m{1}{1}; + [add_t, sub_t, ~, ~] = __split_additive__ (inner_str); + for i = 1:numel (add_t) + tmp_exp = __expand_token__ (add_t{i}, pred_names, p); + rows = [rows, tmp_exp]; + endfor + for i = 1:numel (sub_t) + sub_r = __expand_token__ (sub_t{i}, pred_names, p); + for j = 1:numel (sub_r) + rows = __remove_row__ (rows, sub_r{j}); + endfor + endfor + rows = __dedup_rows__ (rows); + return; + endif + + ## Case 3: crossing (*) + if (__has_at_toplevel__ (tok, '*')) + parts = __split_toplevel__ (tok, '*'); + result = __expand_token__ (parts{1}, pred_names, p); + for i = 2:numel (parts) + right = __expand_token__ (parts{i}, pred_names, p); + result = __crossing_rows__ (result, right); + endfor + rows = result; + return; + endif + + ## Case 4: nesting (/) + if (__has_at_toplevel__ (tok, '/')) + parts = __split_toplevel__ (tok, '/'); + left = __expand_token__ (parts{1}, pred_names, p); + for i = 2:numel (parts) + right = __expand_token__ (parts{i}, pred_names, p); + cross = {}; + for li = 1:numel (left) + for ri = 1:numel (right) + cross{end+1} = left{li} + right{ri}; + endfor + endfor + left = [left, cross]; + endfor + rows = __dedup_rows__ (left); + return; + endif + + ## Case 5: power ladder (var^N) + m = regexp (tok, '^([a-zA-Z_][a-zA-Z0-9_]*)\^(\d+)$', 'tokens'); + if (! isempty (m)) + varname = m{1}{1}; + power = str2double (m{1}{2}); + idx = find (strcmp (pred_names, varname)); + if (! isempty (idx)) + for pwr = 1:power + row = zeros (1, p); + row(idx) = pwr; + rows{end+1} = row; + endfor + return; + endif + endif + + ## Case 6: interaction (A:B) + if (any (tok == ':')) + parts = strtrim (strsplit (tok, ':')); + row = zeros (1, p); + valid = true; + for i = 1:numel (parts) + part = parts{i}; + caret = strfind (part, '^'); + if (! 
isempty (caret)) + base = strtrim (part(1:caret(1)-1)); + pwr = str2double (part(caret(1)+1:end)); + else + base = part; + pwr = 1; + endif + idx = find (strcmp (pred_names, base)); + if (isempty (idx)) + valid = false; + break; + endif + row(idx(1)) = pwr; + endfor + if (valid) + rows = {row}; + endif + return; + endif + + ## Case 7: simple variable + idx = find (strcmp (pred_names, tok)); + if (! isempty (idx)) + row = zeros (1, p); + row(idx) = 1; + rows = {row}; + endif +endfunction + + +## ── Group power expansion ──────────────────────────────────────────────────── + +function rows = __group_power_expand__ (var_indices, max_power, p) + nv = numel (var_indices); + all_exps = __enum_exponents__ (nv, max_power); + rows = {}; + for i = 1:size (all_exps, 1) + row = zeros (1, p); + for j = 1:nv + row(var_indices(j)) = all_exps(i, j); + endfor + rows{end+1} = row; + endfor +endfunction + + +## ── Enumerate exponent vectors ─────────────────────────────────────────────── + +function E = __enum_exponents__ (nv, max_deg) + E = []; + total = (max_deg + 1) ^ nv; + for i = 0:total-1 + vec = zeros (1, nv); + val = i; + for j = 1:nv + vec(j) = mod (val, max_deg + 1); + val = floor (val / (max_deg + 1)); + endfor + s = sum (vec); + if (s >= 1 && s <= max_deg) + E = [E; vec]; + endif + endfor +endfunction + + +## ── Crossing rows (A*B expansion) ──────────────────────────────────────────── + +function rows = __crossing_rows__ (left, right) + cross = {}; + for li = 1:numel (left) + for ri = 1:numel (right) + cross{end+1} = left{li} + right{ri}; + endfor + endfor + rows = [left, right, cross]; + rows = __dedup_rows__ (rows); +endfunction + + +## ── Remove first matching row ──────────────────────────────────────────────── + +function rows = __remove_row__ (rows, target) + for i = 1:numel (rows) + if (isequal (rows{i}, target)) + rows(i) = []; + return; + endif + endfor +endfunction + + +## ── Deduplicate rows ───────────────────────────────────────────────────────── + +function 
rows = __dedup_rows__ (rows) + keep = true (numel (rows), 1); + for i = 1:numel (rows) + if (! keep(i)), continue; endif + for j = i+1:numel (rows) + if (keep(j) && isequal (rows{i}, rows{j})) + keep(j) = false; + endif + endfor + endfor + rows = rows(keep); +endfunction + + +## ── Check for char at top level (outside parens) ───────────────────────────── + +function result = __has_at_toplevel__ (tok, ch) + result = false; + depth = 0; + for i = 1:numel (tok) + if (tok(i) == '('), depth++; + elseif (tok(i) == ')'), depth--; + elseif (depth == 0 && tok(i) == ch) + result = true; + return; + endif + endfor +endfunction + + +## ── Split on char at top level ─────────────────────────────────────────────── + +function parts = __split_toplevel__ (tok, ch) + parts = {}; + depth = 0; + current = ''; + for i = 1:numel (tok) + if (tok(i) == '('), depth++; current = [current, tok(i)]; + elseif (tok(i) == ')'), depth--; current = [current, tok(i)]; + elseif (depth == 0 && tok(i) == ch) + parts{end+1} = strtrim (current); + current = ''; + else + current = [current, tok(i)]; + endif + endfor + if (! 
isempty (current)) + parts{end+1} = strtrim (current); + endif +endfunction + + +## ── Polynomial shorthand to terms ──────────────────────────────────────────── + +function terms = __poly_to_terms__ (max_powers, p, has_intercept) + max_deg = max (max_powers); + exps = cell (1, p); + for j = 1:p + exps{j} = 0:max_powers(j); + endfor + grids = cell (1, p); + [grids{:}] = ndgrid (exps{:}); + n_combos = numel (grids{1}); + all_rows = zeros (n_combos, p); + for j = 1:p + all_rows(:, j) = grids{j}(:); + endfor + total_deg = sum (all_rows, 2); + keep = (total_deg >= 1) & (total_deg <= max_deg); + terms_body = all_rows(keep, :); + [~, ord] = sortrows ([sum(terms_body, 2), terms_body]); + terms_body = terms_body(ord, :); + if (has_intercept) + terms = [zeros(1, p); terms_body]; + else + terms = terms_body; + endif +endfunction diff --git a/inst/private/__fitlm_build_diagnostics__.m b/inst/private/__fitlm_build_diagnostics__.m new file mode 100644 index 00000000..dae490ff --- /dev/null +++ b/inst/private/__fitlm_build_diagnostics__.m @@ -0,0 +1,182 @@ +## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com> +## +## This file is part of the statistics package for GNU Octave. +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . 
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {[@var{Fitted}, @var{Residuals}, @var{Diagnostics}, @
+## @var{MFVNM}] =} __fitlm_build_diagnostics__ (@var{s}, @var{X_full}, @
+## @var{y_full}, @var{incl_mask}, @var{excl_mask})
+##
+## Expand lm_fit_engine output from n_used rows to full n rows.
+##
+## Key behaviors confirmed from MATLAB Phase 4 cross-validation:
+##   - Excluded rows: Fitted computed (X*beta), Residuals = NaN
+##   - Missing rows (NaN in X or y): Fitted = NaN, Residuals = NaN
+##   - Diagnostics: Leverage=0, CooksD/Dffits/S2_i/CovRatio=NaN for
+##     non-included rows
+##
+## @var{s} is the struct returned by @code{lm_fit_engine}.
+## @var{X_full} is the full n x p_tot design matrix.
+## @var{y_full} is the full n x 1 response (NaN for missing rows).
+## @var{incl_mask} is n x 1 logical: rows used in fit.
+## @var{excl_mask} is n x 1 logical: rows explicitly excluded.
+## @end deftypefn
+
+function [Fitted, Residuals, Diagnostics, MFVNM] = ...
+         __fitlm_build_diagnostics__ (s, X_full, y_full, incl_mask, excl_mask)
+
+  n = rows (X_full);
+  p_est = s.p_est;
+  mse = s.mse;
+  dfe = s.dfe;
+
+  ## Rows whose data contain NaN: their Fitted value must stay NaN.
+  data_missing = any (isnan (X_full), 2) | isnan (y_full(:));
+
+  ## -- Fitted ---------------------------------------------------------------
+  Fitted = NaN (n, 1);
+
+  ## Included rows
+  Fitted(incl_mask) = s.yhat;
+
+  ## Excluded-but-not-missing rows: compute X * beta.
+  ## BUGFIX: the previous mask, excl_mask & !(!incl_mask & !excl_mask),
+  ## algebraically reduces to excl_mask itself, so rows that were both
+  ## excluded AND missing were multiplied through X*beta.  Test the data
+  ## for NaNs directly instead.
+  excl_not_missing = excl_mask & ! data_missing;
+  if (any (excl_not_missing))
+    Fitted(excl_not_missing) = X_full(excl_not_missing, :) * s.beta;
+  endif
+  ## Missing rows: Fitted stays NaN
+
+  ## -- Residuals (n x 4 struct) ---------------------------------------------
+  raw_r = NaN (n, 1);
+  pear_r = NaN (n, 1);
+  stud_r = NaN (n, 1);
+  std_r = NaN (n, 1);
+
+  ## Leverage for included rows (n_used x 1)
+  h_inc = s.h;
+
+  ## Pearson      = raw / sqrt(MSE)
+  ## Standardized = raw / sqrt(MSE * (1 - h))
+  ## Studentized (externally) = raw / sqrt(S2_i * (1 - h))
+  ## where S2_i = (dfe * MSE - raw^2 / (1-h)) / (dfe - 1)
+  raw_inc = s.resid;
+
+  raw_r(incl_mask) = raw_inc;
+  pear_r(incl_mask) = raw_inc / sqrt (mse);
+  std_denom = sqrt (mse * (1 - h_inc));
+  std_r(incl_mask) = raw_inc ./ std_denom;
+
+  ## Studentized (leave-one-out MSE)
+  s2_i_inc = max (0, (dfe * mse - raw_inc .^ 2 ./ (1 - h_inc)) / max (1, dfe - 1));
+  stud_denom = sqrt (s2_i_inc .* (1 - h_inc));
+  stud_r_inc = raw_inc ./ stud_denom;
+  stud_r_inc(stud_denom == 0) = 0;
+  stud_r(incl_mask) = stud_r_inc;
+
+  Residuals.Raw = raw_r;
+  Residuals.Pearson = pear_r;
+  Residuals.Studentized = stud_r;
+  Residuals.Standardized = std_r;
+
+  ## -- Diagnostics (n x 7 struct) -------------------------------------------
+  lev = zeros (n, 1);
+  cooks = NaN (n, 1);
+  dffits_v = NaN (n, 1);
+  s2_i_v = NaN (n, 1);
+  covratio_v = NaN (n, 1);
+  dfbetas_v = zeros (n, p_est);
+  hatmat = zeros (n, n);
+
+  ## Hat matrix block for included rows
+  Q_inc = s.Q;               ## n_used x p_est
+  H_inc = Q_inc * Q_inc';    ## n_used x n_used
+
+  lev(incl_mask) = h_inc;
+
+  ## Hat matrix: scatter the included-rows block in one vectorized
+  ## assignment (replaces the former O(n_used^2) double loop).
+  inc_idx = find (incl_mask);
+  hatmat(inc_idx, inc_idx) = H_inc;
+
+  ## Cook's distance: D_i = (raw_i^2 * h_i) / (p_est * MSE * (1-h_i)^2)
+  h_inc_safe = max (h_inc, eps);
+  cooks_inc = (raw_inc .^ 2 .* h_inc) ./ (p_est * mse * (1 - h_inc) .^ 2);
+  cooks(incl_mask) = cooks_inc;
+
+  ## DFFITS: raw / sqrt(S2_i * h)
+  dffits_inc = raw_inc ./ sqrt (s2_i_inc .* h_inc_safe);
+  dffits_inc(h_inc == 0) = 0;
+  dffits_v(incl_mask) = dffits_inc;
+
+  ## S2_i (leave-one-out variance)
+  s2_i_v(incl_mask) = s2_i_inc;
+
+  ## CovRatio: (S2_i / MSE)^p_est / (1-h)
+  s2_ratio = s2_i_inc / max (mse, eps);
+  covrat_inc = (s2_ratio .^ p_est) ./ (1 - h_inc);
+  covrat_inc(h_inc >= 1) = NaN;
+  covratio_v(incl_mask) = covrat_inc;
+
+  ## DFBETAS, scaled by sqrt(S2_i * diag(vcov)).  The engine vcov may be
+  ## p_tot x p_tot; only the first p_est diagonal entries are used.
+  vcov_diag = diag (s.vcov);
+  if (numel (vcov_diag) > p_est)
+    vcov_diag = vcov_diag(1:p_est);
+  endif
+
+  for ii = 1:numel (inc_idx)
+    ri = raw_inc(ii);
+    hi = h_inc(ii);
+    s2i = s2_i_inc(ii);
+    qi = Q_inc(ii, :)';    ## p_est x 1
+    dfb = (ri ./ (1 - hi)) * qi ./ sqrt (s2i .* max (vcov_diag, eps));
+    dfbetas_v(inc_idx(ii), 1:p_est) = dfb';
+  endfor
+
+  Diagnostics.Leverage = lev;
+  Diagnostics.CooksDistance = cooks;
+  Diagnostics.Dffits = dffits_v;
+  Diagnostics.S2_i = s2_i_v;
+  Diagnostics.CovRatio = covratio_v;
+  Diagnostics.Dfbetas = dfbetas_v;
+  Diagnostics.HatMatrix = hatmat;
+
+  ## -- ModelFitVsNullModel --------------------------------------------------
+  ## Whether the model has an intercept is not known inside this function,
+  ## so the F-test against the null model cannot be computed here; the
+  ## caller overrides MFVNM with the real values.
+  MFVNM.Fstat = NaN;
+  MFVNM.Pvalue = NaN;
+  MFVNM.NullModel = NaN;
+
+endfunction
diff --git a/inst/private/__fitlm_build_formula__.m b/inst/private/__fitlm_build_formula__.m
new file mode 100644
index 00000000..b45a6d70
--- /dev/null
+++ b/inst/private/__fitlm_build_formula__.m
@@ -0,0 +1,87 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <http://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {@var{Formula} =} __fitlm_build_formula__ (@
+## @var{terms_mat}, @var{var_names}, @var{pred_names}, @var{resp_name}, @
+## @var{has_intercept}, @var{lp_str}, @var{coef_names})
+##
+## Build the Formula struct for a LinearModel.
+##
+## @var{terms_mat} is the (n_terms x n_all_vars) binary matrix including
+## the response column (always 0).
+##
+## Returns a struct with the 14 confirmed fields from MATLAB Block 1-P4.
+## @end deftypefn
+
+function Formula = __fitlm_build_formula__ (terms_mat, var_names, pred_names, ...
+                                            resp_name, has_intercept, lp_str, coef_names)
+
+  n_terms = rows (terms_mat);
+  n_vars = columns (terms_mat);   ## includes response column
+  n_pred = numel (pred_names);
+
+  ## Build TermNames from terms_mat (using pred columns only)
+  terms_pred = terms_mat(:, 1:n_pred);
+  term_names = cell (n_terms, 1);
+  for i = 1:n_terms
+    row = terms_pred(i, :);
+    if (all (row == 0))
+      term_names{i} = "(Intercept)";
+    else
+      active = find (row);
+      parts = {};
+      for j = 1:numel (active)
+        v = active(j);
+        pwr = row(v);
+        if (pwr == 1)
+          parts{end+1} = pred_names{v};
+        else
+          parts{end+1} = sprintf ("%s^%d", pred_names{v}, pwr);
+        endif
+      endfor
+      term_names{i} = strjoin (parts, ":");
+    endif
+  endfor
+
+  ## InModel: logical vector over var_names (1 = predictor in model, 0 = response)
+  ## Length = n_vars.  Response column is always 0.  A predictor is InModel if
+  ## any non-intercept term has a non-zero entry in that predictor's column.
+  in_model = false (1, n_vars);
+  for j = 1:n_pred
+    if (any (terms_pred(:, j) != 0))
+      in_model(j) = true;
+    endif
+  endfor
+  ## Response column (last) = 0 (already false)
+
+  Formula.LinearPredictor = lp_str;
+  Formula.ResponseName = resp_name;
+  Formula.Terms = terms_mat;
+  Formula.VariableNames = var_names;   ## e.g. {'x1','x2','x3','y'}
+  Formula.HasIntercept = logical (has_intercept);
+  Formula.InModel = in_model;
+  Formula.TermNames = term_names;
+  Formula.PredictorNames = pred_names;
+  Formula.NTerms = n_terms;
+  Formula.NVars = n_vars;
+  Formula.NPredictors = n_pred;
+  Formula.Link = "identity";
+  Formula.ModelFun = @(b, X) X * b;
+  Formula.FunctionCalls = {};
+
+endfunction
diff --git a/inst/private/__fitlm_build_obsinfo__.m b/inst/private/__fitlm_build_obsinfo__.m
new file mode 100644
index 00000000..622350ed
--- /dev/null
+++ b/inst/private/__fitlm_build_obsinfo__.m
@@ -0,0 +1,46 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <http://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {@var{ObservationInfo} =} __fitlm_build_obsinfo__ (@
+## @var{weights}, @var{excl_mask}, @var{raw_X}, @var{raw_y}, @var{obs_names})
+##
+## Build the ObservationInfo table for a LinearModel.
+##
+## Confirmed column names from MATLAB Block 7-P4:
+##   {Weights, Excluded, Missing, Subset}
+## Subset = true means the row was used in the fit (not excluded, not missing).
+## @end deftypefn
+
+function ObservationInfo = __fitlm_build_obsinfo__ (weights, excl_mask, raw_X, raw_y, obs_names)
+
+  n = rows (raw_y);
+
+  ## Missing: any NaN in X rows or y
+  nan_in_X = any (isnan (raw_X), 2);
+  nan_in_y = isnan (raw_y);
+  missing = logical (nan_in_X | nan_in_y);
+
+  excluded = logical (excl_mask(:));
+  subset = (! excluded) & (! missing);
+
+  ObservationInfo.Weights = weights(:);
+  ObservationInfo.Excluded = excluded;
+  ObservationInfo.Missing = missing;
+  ObservationInfo.Subset = subset;
+
+endfunction
diff --git a/inst/private/__fitlm_build_varinfo__.m b/inst/private/__fitlm_build_varinfo__.m
new file mode 100644
index 00000000..d802a6f7
--- /dev/null
+++ b/inst/private/__fitlm_build_varinfo__.m
@@ -0,0 +1,101 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <http://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {@var{VariableInfo} =} __fitlm_build_varinfo__ (@
+## @var{var_names}, @var{cat_flags}, @var{raw_cols}, @var{in_model})
+##
+## Build the VariableInfo struct for a LinearModel.
+##
+## Confirmed from MATLAB Block 2-P4:
+## VariableInfo is a STRUCT (mirroring table layout) with fields:
+##   Class         - cell of char: 'double', 'categorical', etc.
+##   Range         - cell of cell: {[min max]} for numeric,
+##                   {[sorted_uniq]} for categorical
+##   InModel       - logical column vector
+##   IsCategorical - logical column vector
+##
+## Row names are stored separately (= var_names).
+##
+## @var{in_model} is a logical row vector of length numel(var_names): true for
+## each predictor that appears in the model (false for response).
+## @end deftypefn
+
+function VariableInfo = __fitlm_build_varinfo__ (var_names, cat_flags, raw_cols, in_model)
+
+  n_vars = numel (var_names);
+
+  Class = cell (n_vars, 1);
+  Range = cell (n_vars, 1);
+  InModel = false (n_vars, 1);
+  IsCategorical = false (n_vars, 1);
+
+  for j = 1:n_vars
+    col = raw_cols{j};
+
+    ## IsCategorical: from cat_flags (set by CategoricalVars or type detection)
+    IsCategorical(j) = logical (cat_flags(j));
+
+    ## Class
+    if (isa (col, "categorical"))
+      Class{j} = "categorical";
+    elseif (iscellstr (col) || isstring (col))
+      Class{j} = "cell";
+    elseif (isnumeric (col))
+      Class{j} = "double";
+    else
+      Class{j} = class (col);
+    endif
+
+    ## Range
+    if (IsCategorical(j))
+      ## Sorted unique levels
+      if (isa (col, "categorical"))
+        levs = cellstr (categories (col));
+      elseif (iscellstr (col))
+        levs = unique (col);
+      elseif (isstring (col))
+        levs = unique (cellstr (col));
+      else
+        ## Numeric forced categorical -- unique numeric values
+        uvals = unique (col(! isnan (col)));
+        levs = uvals(:)';
+      endif
+      Range{j} = {levs};
+    else
+      ## Numeric -- [min max]
+      if (isnumeric (col))
+        valid = col(! isnan (col));
+        if (isempty (valid))
+          Range{j} = {[NaN NaN]};
+        else
+          Range{j} = {[min(valid), max(valid)]};
+        endif
+      else
+        Range{j} = {[]};
+      endif
+    endif
+
+    ## InModel
+    InModel(j) = logical (in_model(j));
+  endfor
+
+  VariableInfo.Class = Class;
+  VariableInfo.Range = Range;
+  VariableInfo.InModel = InModel;
+  VariableInfo.IsCategorical = IsCategorical;
+
+endfunction
diff --git a/inst/private/__fitlm_parse_inputs__.m b/inst/private/__fitlm_parse_inputs__.m
new file mode 100644
index 00000000..b7974e9b
--- /dev/null
+++ b/inst/private/__fitlm_parse_inputs__.m
@@ -0,0 +1,347 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <http://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {[@var{raw_X}, @var{raw_y}, @var{var_names}, @
+## @var{cat_flags}, @var{weights}, @var{excl_mask}, @var{modelspec}, @
+## @var{obs_names}, @var{resp_name}, @var{pred_names}, @var{has_intercept}] =} @
+## __fitlm_parse_inputs__ (@var{varargin})
+##
+## Parse fitlm inputs into clean typed outputs.
+##
+## Returns:
+##   raw_X         - n x p numeric matrix (NaN in missing rows preserved)
+##   raw_y         - n x 1 numeric response vector
+##   var_names     - 1 x (p+1) cell of char: {pred_names..., resp_name}
+##   cat_flags     - 1 x (p+1) logical: IsCategorical for each variable
+##   weights       - n x 1 double (1.0 if unweighted)
+##   excl_mask     - n x 1 logical (true = explicitly excluded)
+##   modelspec     - char: Wilkinson formula string OR shorthand string
+##   obs_names     - cell of char (row names or {})
+##   resp_name     - char: response variable name
+##   pred_names    - 1 x p cell of char: predictor variable names
+##   has_intercept - logical: true unless explicitly suppressed
+##   raw_cols      - cell of original columns (for VariableInfo building)
+## @end deftypefn
+
+function [raw_X, raw_y, var_names, cat_flags, weights, excl_mask, ...
+          modelspec, obs_names, resp_name, pred_names, has_intercept, ...
+          raw_cols, dummy_coding] = ...
+         __fitlm_parse_inputs__ (varargin)
+
+  if (numel (varargin) < 1)
+    error ("fitlm: at least one input argument required.");
+  endif
+
+  ## --- Detect first argument: table or matrix ---
+  is_table_input = isa (varargin{1}, "table");
+
+  ## Initialize name-value defaults
+  weights_arg = [];
+  excl_arg = [];
+  intercept_arg = true;
+  cat_vars_arg = [];     ## integer indices or {}
+  var_names_arg = {};    ## VarNames override
+  resp_var_arg = "";     ## ResponseVar override
+  pred_vars_arg = {};    ## PredictorVars override
+
+  if (is_table_input)
+    ## -- TABLE INPUT ---------------------------------------------------------
+    tbl = varargin{1};
+    varargin(1) = [];
+
+    ## Extract table metadata
+    all_col_names = tbl.Properties.VariableNames;
+
+    ## ObservationNames from row names
+    obs_names = tbl.Properties.RowNames;   ## cell or {} if unset
+
+    ## Parse optional modelspec (2nd positional arg if char/not a NV key,
+    ## or a numeric terms matrix)
+    modelspec = "linear";   ## default shorthand
+    if (! isempty (varargin) && isnumeric (varargin{1}) && ...
+        ismatrix (varargin{1}) && ndims (varargin{1}) == 2)
+      ## Numeric terms matrix passed directly
+      modelspec = varargin{1};
+      varargin(1) = [];
+    elseif (! isempty (varargin) && ischar (varargin{1}) && ...
+            ! __is_nv_key__ (varargin{1}))
+      modelspec = varargin{1};
+      varargin(1) = [];
+    endif
+
+    ## Parse name-value pairs
+    [weights_arg, excl_arg, intercept_arg, cat_vars_arg, var_names_arg, ...
+     resp_var_arg, pred_vars_arg, dummy_coding_arg] = __parse_nv__ (varargin{:});
+
+    ## Determine response and predictor variables
+    if (! isempty (resp_var_arg))
+      resp_name = resp_var_arg;
+    elseif (! isempty (strfind (modelspec, "~")))
+      ## Formula string -- response is LHS
+      tilde_pos = strfind (modelspec, "~");
+      resp_name = strtrim (modelspec(1:tilde_pos(1)-1));
+    else
+      ## Default: last column of table
+      resp_name = all_col_names{end};
+    endif
+
+    ## Predictor names
+    if (! isempty (pred_vars_arg))
+      if (iscell (pred_vars_arg))
+        pred_names = pred_vars_arg;
+      else
+        pred_names = all_col_names(pred_vars_arg);
+      endif
+    else
+      ## All columns except response
+      pred_names = all_col_names(! strcmp (all_col_names, resp_name));
+    endif
+
+    ## var_names: predictor order then response (matches MATLAB Block 4-P4)
+    var_names = [pred_names, {resp_name}];
+
+    ## Extract raw columns
+    n = rows (tbl);
+    p = numel (pred_names);
+    raw_X = zeros (n, p);
+    raw_cols = cell (1, p + 1);
+
+    cat_flags = false (1, p + 1);
+
+    ## Handle CategoricalVars -- may be names or indices into pred_names
+    cat_idx_set = __resolve_cat_vars__ (cat_vars_arg, pred_names);
+
+    for k = 1:p
+      col = tbl.(pred_names{k});
+      raw_cols{k} = col;
+      if (isa (col, "categorical"))
+        ## Store numeric codes -- raw_X stays as codes for design builder
+        cat_flags(k) = true;
+        raw_X(:, k) = double (col);   ## integer codes (1-based after sort)
+      elseif (iscellstr (col) || isstring (col))
+        cat_flags(k) = true;
+        ## NOTE(review): NaN placeholder; design builder uses raw_cols.
+        ## Downstream missing-row detection scans raw_X for NaNs -- confirm
+        ## cellstr predictors are special-cased there so every row is not
+        ## flagged as missing.
+        raw_X(:, k) = NaN;
+      elseif (ismember (k, cat_idx_set))
+        cat_flags(k) = true;
+        raw_X(:, k) = col;
+      else
+        raw_X(:, k) = col;
+      endif
+    endfor
+
+    ## Response column
+    raw_y = tbl.(resp_name);
+    raw_y = raw_y(:);
+    raw_cols{p+1} = raw_y;
+
+    ## cat_flags for response (always false)
+    cat_flags(p+1) = false;
+
+    ## No VarNames override for table input
+
+  else
+    ## -- MATRIX INPUT --------------------------------------------------------
+    X_in = varargin{1};
+    ## (nargin of a varargin function equals numel (varargin) at entry, so a
+    ## separate nargin test here would be redundant.)
+    if (numel (varargin) < 2)
+      error ("fitlm: matrix input requires at least X and y arguments.");
+    endif
+    y_in = varargin{2}(:);
+    varargin(1:2) = [];
+
+    n = rows (X_in);
+    p = columns (X_in);
+    obs_names = {};
+
+    ## Parse optional modelspec (char string, shorthand, or numeric terms matrix)
+    modelspec = "linear";
+    if (! isempty (varargin) && isnumeric (varargin{1}) && ...
+        ismatrix (varargin{1}) && ndims (varargin{1}) == 2)
+      ## Numeric terms matrix passed directly
+      modelspec = varargin{1};
+      varargin(1) = [];
+    elseif (! isempty (varargin) && ischar (varargin{1}) && ...
+            ! __is_nv_key__ (varargin{1}))
+      modelspec = varargin{1};
+      varargin(1) = [];
+    endif
+
+    ## Parse name-value pairs
+    [weights_arg, excl_arg, intercept_arg, cat_vars_arg, var_names_arg, ...
+     resp_var_arg, pred_vars_arg, dummy_coding_arg] = __parse_nv__ (varargin{:});
+
+    ## Variable names
+    if (! isempty (var_names_arg))
+      ## VarNames override: last = response, rest = predictors
+      if (numel (var_names_arg) != p + 1)
+        error ("fitlm: VarNames must have %d elements (p predictors + 1 response).", p+1);
+      endif
+      pred_names = var_names_arg(1:p);
+      resp_name = var_names_arg{p+1};
+    else
+      ## Auto-generate: x1...xp and y
+      pred_names = arrayfun (@(j) sprintf ("x%d", j), 1:p, "UniformOutput", false);
+      resp_name = "y";
+    endif
+
+    var_names = [pred_names, {resp_name}];
+
+    raw_X = X_in;
+    raw_y = y_in;
+
+    ## cat_flags -- start all false, then mark CategoricalVars
+    cat_flags = false (1, p + 1);
+    cat_idx_set = __resolve_cat_vars__ (cat_vars_arg, pred_names);
+    if (! isempty (cat_idx_set))
+      cat_flags(cat_idx_set) = true;
+    endif
+
+    ## raw_cols for VariableInfo building
+    raw_cols = cell (1, p + 1);
+    for k = 1:p
+      raw_cols{k} = raw_X(:, k);
+    endfor
+    raw_cols{p+1} = raw_y;
+
+  endif   ## is_table_input
+
+  ## --- Weights ---
+  if (isempty (weights_arg))
+    weights = ones (n, 1);
+  else
+    weights = weights_arg(:);
+    if (numel (weights) != n)
+      error ("fitlm: Weights must have %d elements.", n);
+    endif
+  endif
+
+  ## --- Exclusion mask ---
+  excl_mask = false (n, 1);
+  if (! isempty (excl_arg))
+    if (islogical (excl_arg))
+      excl_mask = logical (excl_arg(:));
+    else
+      idx = round (excl_arg(:));
+      excl_mask(idx) = true;
+    endif
+  endif
+
+  ## --- Intercept ---
+  has_intercept = logical (intercept_arg);
+
+  ## If modelspec is a formula string, a trailing '-1' suppresses the
+  ## intercept; the '-1' token itself is handled by the formula parser later.
+  if (ischar (modelspec) && ! isempty (strfind (modelspec, "~")))
+    has_intercept = isempty (regexp (modelspec, '-\s*1'));
+  endif
+
+  ## DummyVarCoding
+  dummy_coding = dummy_coding_arg;
+
+  ## If Intercept=false was explicitly given, override
+  if (! intercept_arg)
+    has_intercept = false;
+  endif
+
+endfunction
+
+## -- Internal helpers ---------------------------------------------------------
+
+function result = __is_nv_key__ (s)
+  ## Return true if s is a known name-value argument name.
+  ## Everything that is NOT a known key and contains '~' or is a shorthand
+  ## is passed as modelspec.
+  known = {"weights", "exclude", "intercept", "categoricalvars", ...
+           "varnames", "robustopts", "responsevar", "predictorvars", ...
+           "dummyvarcoding"};
+  result = any (strcmpi (s, known));
+endfunction
+
+function [weights_arg, excl_arg, intercept_arg, cat_vars_arg, var_names_arg, ...
+          resp_var_arg, pred_vars_arg, dummy_coding_arg] = __parse_nv__ (varargin)
+  weights_arg = [];
+  excl_arg = [];
+  intercept_arg = true;
+  cat_vars_arg = [];
+  var_names_arg = {};
+  resp_var_arg = "";
+  pred_vars_arg = {};
+  dummy_coding_arg = "reference";
+
+  i = 1;
+  while (i <= numel (varargin))
+    if (! ischar (varargin{i}))
+      error ("fitlm: expected name-value argument name at position %d.", i);
+    endif
+    key = lower (varargin{i});
+    if (i + 1 > numel (varargin))
+      error ("fitlm: name-value argument '%s' has no value.", varargin{i});
+    endif
+    val = varargin{i+1};
+    switch (key)
+      case "weights"
+        weights_arg = val;
+      case "exclude"
+        excl_arg = val;
+      case "intercept"
+        intercept_arg = logical (val);
+      case "categoricalvars"
+        cat_vars_arg = val;
+      case "varnames"
+        var_names_arg = val;
+      case "robustopts"
+        ## Accepted but not implemented in Phase 4
+      case "dummyvarcoding"
+        dummy_coding_arg = lower (val);
+      case "responsevar"
+        resp_var_arg = val;
+      case "predictorvars"
+        pred_vars_arg = val;
+      otherwise
+        warning ("fitlm: ignoring unknown name-value argument '%s'.", varargin{i});
+    endswitch
+    i = i + 2;
+  endwhile
+endfunction
+
+function idx_set = __resolve_cat_vars__ (cat_vars_arg, pred_names)
+  ## Return integer indices (into pred_names) of categorical predictors.
+  idx_set = [];
+  if (isempty (cat_vars_arg))
+    return;
+  endif
+  p = numel (pred_names);
+  if (isnumeric (cat_vars_arg))
+    idx_set = round (cat_vars_arg(:))';
+  elseif (iscell (cat_vars_arg))
+    for k = 1:numel (cat_vars_arg)
+      m = find (strcmp (pred_names, cat_vars_arg{k}));
+      if (! isempty (m))
+        idx_set(end+1) = m;
+      endif
+    endfor
+  elseif (ischar (cat_vars_arg))
+    m = find (strcmp (pred_names, cat_vars_arg));
+    if (! isempty (m))
+      idx_set = m;
+    endif
+  endif
+endfunction
diff --git a/inst/private/__stepwiselm_candidates__.m b/inst/private/__stepwiselm_candidates__.m
new file mode 100644
index 00000000..de8d92ef
--- /dev/null
+++ b/inst/private/__stepwiselm_candidates__.m
@@ -0,0 +1,219 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Private Function} {@var{candidates} =} __stepwiselm_candidates__ @ +## (@var{current_terms}, @var{lower_terms}, @var{upper_terms}, @ +## @var{cat_flags}, @var{pred_names}) +## +## Enumerate candidate terms for adding or removing in stepwise regression. +## +## @strong{Inputs} +## +## @table @var +## @item current_terms +## Binary terms matrix of the current model (n_terms x n_vars). +## Includes the response column (last column, always 0). +## @item lower_terms +## Binary terms matrix of the lower (minimum) model. +## @item upper_terms +## Binary terms matrix of the upper (maximum) model. +## @item cat_flags +## Logical vector (1 x n_vars) indicating categorical variables. +## @item pred_names +## Cell array of predictor variable names. +## @end table +## +## @strong{Output} +## +## @var{candidates} is a struct array. Each element has fields: +## @table @code +## @item term_row +## Binary row vector (1 x n_vars) for this term. +## @item term_name +## Char string: e.g., @code{'x1'}, @code{'x1:x2'}, @code{'Species'}. +## @item action +## @code{'add'} or @code{'remove'}. +## @item df_term +## Number of design-matrix columns this term contributes: +## 1 for numeric, k-1 for k-level categorical, products for interactions. 
+## @end table +## +## @seealso{stepwiselm, __stepwiselm_ftest__} +## @end deftypefn + +function candidates = __stepwiselm_candidates__ (current_terms, lower_terms, ... + upper_terms, cat_flags, ... + pred_names) + + candidates = struct ("term_row", {}, "term_name", {}, ... + "action", {}, "df_term", {}); + + p = numel (pred_names); ## number of predictors (excl. response) + n_vars = columns (current_terms); ## p + 1 (includes response col) + + ## ── REMOVE candidates ───────────────────────────────────────────────────── + ## Terms in current_terms that are NOT in lower_terms and NOT the intercept. + for i = 1:rows (current_terms) + row = current_terms(i, :); + + ## Skip intercept (all-zeros row) + if (all (row == 0)) + continue; + endif + + ## Check if this term is in lower_terms (protected) + is_lower = false; + for j = 1:rows (lower_terms) + if (isequal (row, lower_terms(j, :))) + is_lower = true; + break; + endif + endfor + if (is_lower) + continue; + endif + + ## This is a valid removal candidate + c.term_row = row; + c.term_name = __term_row_to_name__ (row, pred_names, p); + c.action = "remove"; + c.df_term = __compute_df_term__ (row, cat_flags, pred_names, p); + candidates(end+1) = c; + endfor + + ## ── ADD candidates ──────────────────────────────────────────────────────── + ## Terms in upper_terms that are NOT in current_terms. 
+ for i = 1:rows (upper_terms) + row = upper_terms(i, :); + + ## Skip intercept (it's always in the model) + if (all (row == 0)) + continue; + endif + + ## Check if already in current_terms + already_in = false; + for j = 1:rows (current_terms) + if (isequal (row, current_terms(j, :))) + already_in = true; + break; + endif + endfor + if (already_in) + continue; + endif + + ## Valid add candidate + c.term_row = row; + c.term_name = __term_row_to_name__ (row, pred_names, p); + c.action = "add"; + c.df_term = __compute_df_term__ (row, cat_flags, pred_names, p); + candidates(end+1) = c; + endfor + +endfunction + + +## ── Helper: term row → human-readable name ────────────────────────────────── + +function name = __term_row_to_name__ (row, pred_names, p) + ## Convert a terms matrix row (1 x n_vars) to a human-readable term name + ## like 'x1', 'x1:x2', 'Species', 'x1^2'. + ## + ## The row may have width = p or p+1 (with response column). + ## We only look at columns 1..p. + + active = find (row(1:p)); + if (isempty (active)) + name = "1"; ## intercept + return; + endif + + parts = cell (1, numel (active)); + for k = 1:numel (active) + v = active(k); + pwr = row(v); + if (pwr == 1) + parts{k} = pred_names{v}; + else + parts{k} = sprintf ("%s^%d", pred_names{v}, pwr); + endif + endfor + name = strjoin (parts, ":"); +endfunction + + +## ── Helper: compute df_term ───────────────────────────────────────────────── + +function df = __compute_df_term__ (row, cat_flags, pred_names, p) + ## Compute number of design matrix columns for a term. 
+  ##
+  ## Rules (§8 of reference doc):
+  ##   Numeric predictor:            df = 1
+  ##   Categorical (k levels):       df = k - 1
+  ##   Interaction num × num:        df = 1
+  ##   Interaction num × cat(k):     df = k - 1
+  ##   Interaction cat(j) × cat(k):  df = (j-1) × (k-1)
+  ##   Squared numeric (x^2):        df = 1
+
+  active = find (row(1:p));
+  if (isempty (active))
+    df = 1; ## intercept
+    return;
+  endif
+
+  ## df is multiplicative over the factors participating in the term.
+  df = 1;
+  for k = 1:numel (active)
+    v = active(k);
+    if (cat_flags(v))
+      ## Categorical factor.  cat_flags(v) carries one of two encodings:
+      ##   - numeric > 1: the number of levels k for this factor; it then
+      ##     contributes a multiplicative factor of k - 1 columns.
+      ##   - logical true: the level count is not available here (cat_flags
+      ##     is a plain logical vector), so we contribute 1 as a placeholder.
+      ##     The authoritative df is recomputed in __stepwiselm_ftest__ from
+      ##     the actual difference in design-matrix column counts, which is
+      ##     always correct regardless of the placeholder used here.
+      if (isnumeric (cat_flags) && cat_flags(v) > 1)
+        df = df * (cat_flags(v) - 1);
+      else
+        ## Default: will be resolved by design matrix column counting
+        ## in __stepwiselm_ftest__
+        df = df * 1;
+      endif
+    else
+      ## Numeric: 1 column regardless of power
+      df = df * 1;
+    endif
+  endfor
+endfunction
diff --git a/inst/private/__stepwiselm_ftest__.m b/inst/private/__stepwiselm_ftest__.m
new file mode 100644
index 00000000..00b110c2
--- /dev/null
+++ b/inst/private/__stepwiselm_ftest__.m
@@ -0,0 +1,202 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see .
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {[@var{fstat}, @var{pval}, @var{sse_new}, @
+## @var{mse_new}, @var{df_term}, @var{crit_new}] =} @
+## __stepwiselm_ftest__ (@var{raw_X}, @var{raw_y}, @var{weights}, @
+## @var{incl_mask}, @var{current_terms}, @var{candidate}, @
+## @var{cat_flags}, @var{var_names}, @var{pred_names}, @
+## @var{raw_cols}, @var{has_intercept}, @var{criterion})
+##
+## Compute the partial F-test statistic for a candidate term action.
+##
+## This function builds the design matrix for the candidate model using
+## @code{__fitlm_build_design__} and calls @code{lm_fit_engine} directly
+## (not @code{fitlm}) for maximum performance inside the stepwise loop.
+##
+## @strong{F-test formula (confirmed §7)}
+##
+## For @strong{adding} a term:
+## @example
+## F = (SSE_current - SSE_augmented) / df_term / MSE_augmented
+## p = 1 - fcdf(F, df_term, DFE_augmented)
+## @end example
+##
+## For @strong{removing} a term:
+## @example
+## F = (SSE_reduced - SSE_current) / abs(df_term) / MSE_current
+## p = 1 - fcdf(F, abs(df_term), DFE_current)
+## @end example
+##
+## Key rule: the denominator always uses MSE of the @strong{larger}
+## (more complex) model.
+##
+## @seealso{stepwiselm, __stepwiselm_candidates__, lm_fit_engine}
+## @end deftypefn
+
+function [fstat, pval, sse_new, mse_new, df_term, crit_new] = ...
+         __stepwiselm_ftest__ (raw_X, raw_y, weights, incl_mask, ...
+                               current_terms, candidate, cat_flags, ...
+                               var_names, pred_names, raw_cols, ...
+                               has_intercept, criterion)
+
+  n_pred = numel (pred_names);
+  excl_mask = ! incl_mask; ## __fitlm_build_design__ expects excl_mask
+
+  ## ── Build design matrices for both current and candidate models ──────────
+
+  if (strcmp (candidate.action, "add"))
+    ## Adding term: current model is smaller, candidate model is larger
+    new_terms = [current_terms; candidate.term_row];
+
+    ## Sort by term order (number of active predictors) for clean design
+    term_order = sum (new_terms(:, 1:n_pred), 2);
+    [~, sort_idx] = sort (term_order);
+    new_terms = new_terms(sort_idx, :);
+
+    ## Build design for current model (smaller)
+    [~, ~, X_curr, ~, ~, ~, ~, ~] = ...
+      __fitlm_build_design__ (raw_X, raw_y, current_terms, var_names, ...
+                              cat_flags, weights, excl_mask, pred_names, ...
+                              raw_cols, has_intercept);
+
+    ## Build design for augmented model (larger)
+    [~, ~, X_new, ~, ~, ~, ~, ~] = ...
+      __fitlm_build_design__ (raw_X, raw_y, new_terms, var_names, ...
+                              cat_flags, weights, excl_mask, pred_names, ...
+                              raw_cols, has_intercept);
+
+    ## Extract included rows
+    X_curr_incl = X_curr(incl_mask, :);
+    X_new_incl = X_new(incl_mask, :);
+    y_incl = raw_y(incl_mask);
+    w_incl = weights(incl_mask);
+
+    ## Fit both models.  Save and restore the caller's warning state instead
+    ## of unconditionally turning the warning back on, so a user who disabled
+    ## "CompactLinearModel:rankDeficient" keeps it disabled.  unwind_protect
+    ## guarantees restoration even if a fit errors out.
+    ws = warning ("off", "CompactLinearModel:rankDeficient");
+    unwind_protect
+      s_curr = lm_fit_engine (X_curr_incl, y_incl, w_incl);
+      s_new = lm_fit_engine (X_new_incl, y_incl, w_incl);
+    unwind_protect_cleanup
+      warning (ws);
+    end_unwind_protect
+
+    ## df_term = difference in design matrix columns (correct for categoricals)
+    df_term = columns (X_new_incl) - columns (X_curr_incl);
+    if (df_term <= 0)
+      df_term = 1; ## safety fallback
+    endif
+
+    ## F = (SSE_curr - SSE_new) / df / MSE_new
+    ## Denominator is MSE of the LARGER (augmented) model
+    if (s_new.mse > 0 && df_term > 0)
+      fstat = (s_curr.sse - s_new.sse) / df_term / s_new.mse;
+      pval = 1 - fcdf (fstat, df_term, s_new.dfe);
+    else
+      ## Degenerate fit (zero MSE): treat the candidate as non-significant.
+      fstat = 0;
+      pval = 1;
+    endif
+
+    sse_new = s_new.sse;
+    mse_new = s_new.mse;
+    crit_new = __compute_criterion__ (s_new, criterion);
+
+  else
+    ## Removing term: current model is larger, candidate model is smaller
+    ## Build reduced model (without the term)
+    red_terms = __remove_term_row__ (current_terms, candidate.term_row);
+
+    ## Build design for current model (larger)
+    [~, ~, X_curr, ~, ~, ~, ~, ~] = ...
+      __fitlm_build_design__ (raw_X, raw_y, current_terms, var_names, ...
+                              cat_flags, weights, excl_mask, pred_names, ...
+                              raw_cols, has_intercept);
+
+    ## Build design for reduced model (smaller)
+    [~, ~, X_red, ~, ~, ~, ~, ~] = ...
+      __fitlm_build_design__ (raw_X, raw_y, red_terms, var_names, ...
+                              cat_flags, weights, excl_mask, pred_names, ...
+                              raw_cols, has_intercept);
+
+    ## Extract included rows
+    X_curr_incl = X_curr(incl_mask, :);
+    X_red_incl = X_red(incl_mask, :);
+    y_incl = raw_y(incl_mask);
+    w_incl = weights(incl_mask);
+
+    ## Fit both models (warning state saved/restored as in the add branch)
+    ws = warning ("off", "CompactLinearModel:rankDeficient");
+    unwind_protect
+      s_curr = lm_fit_engine (X_curr_incl, y_incl, w_incl);
+      s_red = lm_fit_engine (X_red_incl, y_incl, w_incl);
+    unwind_protect_cleanup
+      warning (ws);
+    end_unwind_protect
+
+    ## df_term = difference in design matrix columns
+    df_term = columns (X_curr_incl) - columns (X_red_incl);
+    if (df_term <= 0)
+      df_term = 1;
+    endif
+
+    ## F = (SSE_red - SSE_curr) / df / MSE_curr
+    ## Denominator is MSE of the LARGER (current) model
+    if (s_curr.mse > 0 && df_term > 0)
+      fstat = (s_red.sse - s_curr.sse) / df_term / s_curr.mse;
+      pval = 1 - fcdf (fstat, df_term, s_curr.dfe);
+    else
+      fstat = 0;
+      pval = 1;
+    endif
+
+    sse_new = s_red.sse;
+    mse_new = s_red.mse;
+    crit_new = __compute_criterion__ (s_red, criterion);
+  endif
+
+endfunction
+
+
+## ── Helper: remove a term row from a terms matrix ───────────────────────────
+
+function new_terms = __remove_term_row__ (terms, term_row)
+  ## Remove the FIRST row matching term_row from the terms matrix.
+  ## Only one row is removed because a stepwise action operates on a single
+  ## term at a time (duplicate rows, if any, are left untouched).
+  mask = true (rows (terms), 1);
+  for j = 1:rows (terms)
+    if (isequal (terms(j, :), term_row))
+      mask(j) = false;
+      break; ## remove only the first match
+    endif
+  endfor
+  new_terms = terms(mask, :);
+endfunction
+
+
+## ── Helper: compute criterion value from lm_fit_engine output ───────────────
+
+function val = __compute_criterion__ (s, criterion)
+  ## Compute model selection criterion from lm_fit_engine struct.
+  switch (criterion)
+    case "sse"
+      val = s.sse;
+    case "aic"
+      val = s.crit.AIC;
+    case "bic"
+      val = s.crit.BIC;
+    case "rsquared"
+      val = -s.rsq.Ordinary; ## negate so lower = better
+    case "adjrsquared"
+      val = -s.rsq.Adjusted; ## negate so lower = better
+    otherwise
+      ## Criterion strings are validated in __stepwiselm_parse_inputs__, so
+      ## this branch is unreachable in normal use; fall back to SSE defensively.
+      val = s.sse;
+  endswitch
+endfunction
diff --git a/inst/private/__stepwiselm_parse_inputs__.m b/inst/private/__stepwiselm_parse_inputs__.m
new file mode 100644
index 00000000..f1c30220
--- /dev/null
+++ b/inst/private/__stepwiselm_parse_inputs__.m
@@ -0,0 +1,138 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see .
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {[@var{raw_X}, @var{raw_y}, @var{var_names}, @
+## @var{cat_flags}, @var{weights}, @var{excl_mask}, @var{modelspec}, @
+## @var{obs_names}, @var{resp_name}, @var{pred_names}, @var{has_intercept}, @
+## @var{raw_cols}, @var{lower_spec}, @var{upper_spec}, @var{p_enter}, @
+## @var{p_remove}, @var{criterion}, @var{n_steps}, @var{verbose}] =} @
+## __stepwiselm_parse_inputs__ (@var{varargin})
+##
+## Parse stepwiselm inputs. Extracts the seven stepwise-specific name-value
+## arguments (Lower, Upper, PEnter, PRemove, Criterion, NSteps, Verbose),
+## validates them, then delegates the remaining arguments to
+## __fitlm_parse_inputs__ for standard fitlm parsing.
+##
+## Validation rules:
+## @itemize
+## @item @code{PEnter} must be strictly less than @code{PRemove}, regardless
+## of criterion. This matches MATLAB's behavior (even for AIC/BIC).
+## @item @code{Criterion} must be one of @code{'sse'}, @code{'aic'},
+## @code{'bic'}, @code{'rsquared'}, @code{'adjrsquared'}.
+## @item @code{NSteps} must be a positive integer or @code{Inf}.
+## @item @code{Verbose} must be 0, 1, or 2.
+## @end itemize
+##
+## @seealso{stepwiselm, __fitlm_parse_inputs__}
+## @end deftypefn
+
+function [raw_X, raw_y, var_names, cat_flags, weights, excl_mask, ...
+          modelspec, obs_names, resp_name, pred_names, has_intercept, ...
+          raw_cols, lower_spec, upper_spec, p_enter, p_remove, ...
+          criterion, n_steps, verbose] = ...
+          __stepwiselm_parse_inputs__ (varargin)
+
+  ## ── Initialize stepwise-specific defaults ─────────────────────────────────
+  lower_spec = "constant";
+  upper_spec = "interactions";
+  p_enter = 0.05;
+  p_remove = 0.10;
+  criterion = "sse";
+  n_steps = Inf;
+  verbose = 1;
+
+  ## ── Extract stepwise NV args from varargin before passing to fitlm parser ─
+  ## We walk through varargin and pull out stepwise-specific keys,
+  ## leaving everything else for __fitlm_parse_inputs__.
+  ##
+  ## NOTE(review): this scan classifies ANY char argument that matches a
+  ## stepwise key as a stepwise option.  A fitlm name-value pair whose VALUE
+  ## is such a char (e.g. a variable literally named 'verbose' passed as a
+  ## ResponseVar value) would be misclassified here and would steal the next
+  ## argument as its value — confirm whether fitlm accepts such values and
+  ## add pairwise consumption if so.
+  remaining = {};
+  stepwise_keys = {"lower", "upper", "penter", "premove", ...
+                   "criterion", "nsteps", "verbose"};
+
+  i = 1;
+  while (i <= numel (varargin))
+    ## Non-char args (data matrices, table, numeric modelspec) pass through
+    if (! ischar (varargin{i}))
+      remaining{end+1} = varargin{i};
+      i = i + 1;
+      continue;
+    endif
+
+    key_lower = lower (varargin{i});
+
+    ## Check if this is a stepwise-specific key
+    if (any (strcmp (key_lower, stepwise_keys)))
+      if (i + 1 > numel (varargin))
+        error ("stepwiselm: name-value argument '%s' has no value.", ...
+               varargin{i});
+      endif
+      val = varargin{i+1};
+      switch (key_lower)
+        case "lower"
+          lower_spec = val;
+        case "upper"
+          upper_spec = val;
+        case "penter"
+          p_enter = double (val);
+        case "premove"
+          p_remove = double (val);
+        case "criterion"
+          criterion = lower (val);
+        case "nsteps"
+          n_steps = double (val);
+        case "verbose"
+          verbose = double (val);
+      endswitch
+      i = i + 2; ## consume key and value
+    else
+      ## Not a stepwise key — pass through to fitlm parser
+      remaining{end+1} = varargin{i};
+      i = i + 1;
+    endif
+  endwhile
+
+  ## ── Validate stepwise arguments ───────────────────────────────────────────
+
+  ## PEnter < PRemove always required (matches MATLAB, even for AIC/BIC)
+  if (p_enter >= p_remove)
+    error ("stepwiselm: Cannot have PEnter (%g) >= PRemove (%g) for the %s criterion.", ...
+           p_enter, p_remove, criterion);
+  endif
+
+  ## Criterion must be valid
+  valid_criteria = {"sse", "aic", "bic", "rsquared", "adjrsquared"};
+  if (! any (strcmp (criterion, valid_criteria)))
+    error ("stepwiselm: Criterion must be one of: %s.", ...
+           strjoin (valid_criteria, ", "));
+  endif
+
+  ## NSteps must be positive
+  if (! (isinf (n_steps) || (n_steps > 0 && n_steps == round (n_steps))))
+    error ("stepwiselm: NSteps must be a positive integer or Inf.");
+  endif
+
+  ## Verbose must be 0, 1, or 2
+  if (! any (verbose == [0, 1, 2]))
+    error ("stepwiselm: Verbose must be 0, 1, or 2.");
+  endif
+
+  ## ── Delegate to fitlm parser ──────────────────────────────────────────────
+  [raw_X, raw_y, var_names, cat_flags, weights, excl_mask, ...
+   modelspec, obs_names, resp_name, pred_names, has_intercept, raw_cols] = ...
+    __fitlm_parse_inputs__ (remaining{:});
+
+endfunction
diff --git a/inst/private/lm_fit_engine.m b/inst/private/lm_fit_engine.m
new file mode 100644
index 00000000..1ca03d43
--- /dev/null
+++ b/inst/private/lm_fit_engine.m
@@ -0,0 +1,263 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Private Function} {@var{s} =} lm_fit_engine (@var{X}, @var{y}) +## @deftypefnx {Private Function} {@var{s} =} lm_fit_engine (@var{X}, @var{y}, @var{weights}) +## +## QR-based ordinary least squares regression engine. +## +## This is the single source of truth for all linear regression math in the +## statistics package. It is called by @code{fitlm} (Phase 4) and by +## @code{LinearModel} methods such as @code{addTerms} and @code{removeTerms} +## (Phase 3). +## +## @var{X} is the @math{n x p} design matrix (including intercept column if +## desired). @var{y} is the @math{n x 1} response vector. @var{weights} is +## an optional @math{n x 1} vector of observation weights; pass @code{[]} for +## unweighted OLS. +## +## Returns a struct @var{s} with the following fields: +## +## @table @code +## @item beta +## Coefficient estimates (p x 1). +## @item se +## Standard errors (p x 1). +## @item tstat +## t-statistics (p x 1). +## @item pval +## p-values for two-sided t-test (p x 1). +## @item vcov +## Coefficient covariance matrix (p x p). +## @item yhat +## Fitted values (n x 1). +## @item resid +## Residuals (n x 1). +## @item dfe +## Error degrees of freedom. +## @item sse +## Sum of squared errors (residual sum of squares). +## @item ssr +## Regression sum of squares. 
+## @item sst
+## Total sum of squares (= SSE + SSR).
+## @item mse
+## Mean squared error (SSE / DFE).
+## @item rmse
+## Root mean squared error.
+## @item logL
+## Log-likelihood under normality assumption.
+## @item p_est
+## Number of estimated (non-rank-deficient) coefficients.
+## @item p_tot
+## Total number of coefficients in the model.
+## @item rank_def
+## Logical vector indicating rank-deficient columns.
+## @item rsq
+## Struct with fields @code{Ordinary} and @code{Adjusted}.
+## @item crit
+## Struct with fields @code{AIC}, @code{AICc}, @code{BIC}, @code{CAIC}.
+## @item n
+## Number of observations.
+## @item Q
+## Economy QR factor for non-deficient columns (n x p_est). Used by
+## @code{LinearModel} to compute the hat matrix without a second QR.
+## @item h
+## Leverage vector (n x 1), equal to @code{diag(Q*Q')} = row sums of
+## squares of @var{Q}.
+## @end table
+##
+## @seealso{CompactLinearModel}
+## @end deftypefn
+
+function s = lm_fit_engine (X, y, weights)
+
+  if (nargin < 2)
+    print_usage ();
+  endif
+  if (nargin < 3)
+    weights = []; ## empty => unweighted OLS
+  endif
+
+  n = rows (X);
+  p = columns (X);
+
+  ## Weighted or unweighted QR
+  if (isempty (weights))
+    [Q, R] = qr (X, 0);
+    Qty = Q' * y;
+  else
+    ## WLS via sqrt-weight row scaling of X and y
+    sqw = sqrt (weights(:));
+    [Q, R] = qr (bsxfun (@times, sqw, X), 0);
+    Qty = Q' * (sqw .* y);
+  endif
+
+  ## Rank deficiency detection
+  ## Detect before solving to avoid spurious singular-matrix warnings
+  tol = max ([n, p]) * eps (norm (R, "inf"));
+  rank_def = abs (diag (R)) < tol;
+
+  ## Solve for beta
+  if (any (rank_def))
+    warning ("CompactLinearModel:rankDeficient", ...
+             "CompactLinearModel: rank deficient, rank = %d.", ...
+             sum (! rank_def));
+    ## Solve only the non-deficient subsystem
+    idx_good = find (! rank_def);
+    beta = zeros (p, 1);
+    R_good = R(idx_good, idx_good);
+    beta(idx_good) = R_good \ Qty(idx_good);
+  else
+    beta = R \ Qty;
+  endif
+
+  ## Residuals and sums of squares
+  ## NOTE(review): with weights supplied, sse/ssr below are computed from
+  ## UNWEIGHTED residuals while beta is the WLS solution — confirm this
+  ## matches the package convention for weighted SSE and R-squared.
+  yhat = X * beta;
+  resid = y - yhat;
+  p_est = sum (! rank_def); ## NumEstimatedCoefficients
+  dfe = n - p_est;
+  sse = sum (resid .^ 2);
+  ybar = mean (y);
+  ssr = sum ((yhat - ybar) .^ 2);
+  sst = sse + ssr; ## Always SSE + SSR (not sum((y-ybar)^2))
+  mse = sse / dfe;
+  rmse = sqrt (mse);
+
+  ## Coefficient covariance
+  ## Use pinv-style via R to handle rank deficiency
+  Rinv = zeros (p, p);
+  idx = find (! rank_def);
+  R_est = R(idx, idx);
+  Rinv(idx, idx) = R_est \ eye (numel (idx));
+  vcov = (Rinv * Rinv') * mse;
+
+  se = sqrt (diag (vcov));
+  tstat = beta ./ se;
+  tstat(rank_def) = NaN;
+  se(rank_def) = 0;
+  ## Two-sided p-value.  2*tcdf(-|t|) is mathematically identical to
+  ## 2*(1 - tcdf(|t|)) but avoids catastrophic cancellation for large |t|,
+  ## so very small p-values do not round to 0 prematurely.
+  pval = 2 * tcdf (-abs (tstat), dfe);
+  pval(rank_def) = NaN;
+
+  ## Log-likelihood and information criteria
+  ## Under normality, MLE variance estimate is sigma2 = SSE/n (not MSE = SSE/DFE)
+  ## This matches MATLAB's computation of LogLikelihood.
+  sigma2_mle = sse / n;
+  logL = -n/2 * log (2 * pi * sigma2_mle) - sse / (2 * sigma2_mle);
+
+  ## m = NumEstimatedCoefficients (no +1 for sigma^2)
+  ## Verified: for n=150, logL=-34.787, p_est=5 -> AIC = -2*(-34.787)+2*5 = 79.574
+  m = p_est;
+  aic = -2 * logL + 2 * m;
+  aicc = aic + (2 * m * (m + 1)) / (n - m - 1);
+  bic = -2 * logL + m * log (n);
+  caic = -2 * logL + m * (log (n) + 1);
+
+  ## Pack output struct
+  s.beta = beta;
+  s.se = se;
+  s.tstat = tstat;
+  s.pval = pval;
+  s.vcov = vcov;
+  s.yhat = yhat;
+  s.resid = resid;
+  s.n = n;
+  s.dfe = dfe;
+  s.sse = sse;
+  s.ssr = ssr;
+  s.sst = sst;
+  s.mse = mse;
+  s.rmse = rmse;
+  s.logL = logL;
+  s.p_est = p_est;
+  s.p_tot = p;
+  s.rank_def = rank_def;
+  s.rsq = struct ("Ordinary", ssr / sst, ...
+                  "Adjusted", 1 - (sse / dfe) / (sst / (n - 1)));
+  s.crit = struct ("AIC", aic, "AICc", aicc, "BIC", bic, "CAIC", caic);
+
+  ## QR factor and leverage for LinearModel diagnostics
+  Q_est = Q(:, ! rank_def);
+  s.Q = Q_est;
+  s.h = sum (Q_est .^ 2, 2);
+
+endfunction
+
+## Simple 2-predictor OLS — linearly independent columns
+%!test
+%! X = [ones(5,1), [1;2;3;4;5], [2;1;4;3;6]];
+%! y = X * [1; 2; -0.5];
+%! s = lm_fit_engine (X, y);
+%! ## Perfect fit (noise-free)
+%! assert (s.beta, [1; 2; -0.5], 1e-10);
+%! assert (s.resid, zeros (5, 1), 1e-10);
+%! assert (s.p_tot, 3);
+%! assert (s.n, 5);
+
+## Known regression — y = 2 + 3*x with noise-free data
+%!test
+%! x = (1:10)';
+%! y = 2 + 3 * x;
+%! X = [ones(10,1), x];
+%! s = lm_fit_engine (X, y);
+%! assert (s.beta, [2; 3], 1e-10);
+%! assert (s.sse, 0, 1e-10);
+%! assert (s.rsq.Ordinary, 1, 1e-10);
+
+## Rank-deficient design matrix
+%!test
+%! warning ("off", "CompactLinearModel:rankDeficient");
+%! X = [1, 2, 4; 1, 3, 6; 1, 4, 8; 1, 5, 10]; ## col3 = 2*col2
+%! y = [1; 2; 3; 4];
+%! s = lm_fit_engine (X, y);
+%! assert (s.p_est, 2);
+%! assert (s.p_tot, 3);
+%! assert (any (s.rank_def), true);
+%! ## The rank-deficient coefficient should be zeroed
+%! assert (s.beta(s.rank_def), 0);
+%! assert (s.se(s.rank_def), 0);
+%! assert (isnan (s.tstat(s.rank_def)), true);
+%! assert (isnan (s.pval(s.rank_def)), true);
+%! warning ("on", "CompactLinearModel:rankDeficient");
+
+## SST = SSE + SSR identity
+%!test
+%! X = [ones(20,1), randn(20,2)];
+%! y = X * [1; 2; -1] + 0.5 * randn (20, 1);
+%! s = lm_fit_engine (X, y);
+%! assert (s.sst, s.sse + s.ssr, 1e-10);
+%! assert (s.dfe, 20 - 3);
+
+## ModelCriterion — m = p_est (no +1)
+## Verify AIC = -2*logL + 2*p_est
+%!test
+%! X = [ones(20,1), (1:20)'];
+%! y = 3 + 2 * (1:20)' + randn (20, 1) * 0.1;
+%! s = lm_fit_engine (X, y);
+%! assert (s.crit.AIC, -2 * s.logL + 2 * s.p_est, 1e-10);
+%! assert (s.crit.BIC, -2 * s.logL + s.p_est * log (s.n), 1e-10);
+
+## Weighted regression
+%!test
+%! X = [ones(5,1), [1;2;3;4;5]];
+%! y = [2; 4; 5; 4; 5];
+%! w = [1; 1; 1; 1; 1];
+%! s1 = lm_fit_engine (X, y, []);
+%! s2 = lm_fit_engine (X, y, w);
+%! ## Equal weights should give same result as unweighted
+%! assert (s1.beta, s2.beta, 1e-10);
+%! assert (s1.sse, s2.sse, 1e-10);
diff --git a/inst/stepwiselm.m b/inst/stepwiselm.m
new file mode 100644
index 00000000..37bb28ef
--- /dev/null
+++ b/inst/stepwiselm.m
@@ -0,0 +1,488 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see .
+
+## -*- texinfo -*-
+## @deftypefn {statistics} {@var{mdl} =} stepwiselm (@var{X}, @var{y})
+## @deftypefnx {statistics} {@var{mdl} =} stepwiselm (@var{X}, @var{y}, @var{modelspec})
+## @deftypefnx {statistics} {@var{mdl} =} stepwiselm (@var{tbl})
+## @deftypefnx {statistics} {@var{mdl} =} stepwiselm (@var{tbl}, @var{modelspec})
+## @deftypefnx {statistics} {@var{mdl} =} stepwiselm (@dots{}, @var{Name}, @var{Value})
+##
+## Fit a linear regression model using stepwise regression.
+##
+## @code{stepwiselm} performs stepwise regression starting from an initial
+## model and iteratively adding or removing terms based on their statistical
+## significance, returning a @code{LinearModel} object.
+##
+## @var{modelspec} specifies the @strong{starting} model (default:
+## @qcode{'constant'}).
+##
+## @strong{Stepwise-specific Name-Value Arguments}
+##
+## @table @asis
+## @item @qcode{'Lower'} — minimum model (default: @qcode{'constant'})
+## @item @qcode{'Upper'} — maximum model (default: @qcode{'interactions'})
+## @item @qcode{'PEnter'} — p-value threshold to add a term (default: 0.05)
+## @item @qcode{'PRemove'} — p-value threshold to remove a term (default: 0.10)
+## @item @qcode{'Criterion'} — @qcode{'sse'} (default), @qcode{'aic'},
+## @qcode{'bic'}, @qcode{'rsquared'}, or @qcode{'adjrsquared'}
+## @item @qcode{'NSteps'} — maximum number of steps (default: Inf)
+## @item @qcode{'Verbose'} — 0 (silent), 1 (one line/step), 2 (per-candidate)
+## @end table
+##
+## All @code{fitlm} name-value arguments are also accepted.
+##
+## @seealso{fitlm, LinearModel}
+## @end deftypefn
+
+function mdl = stepwiselm (varargin)
+
+  ## Algorithm: fit the starting model, then alternate a backward scan
+  ## (repeatedly drop the least significant removable term) with a forward
+  ## scan (add the single most significant eligible term) until no action
+  ## qualifies or NSteps actions have been taken.  With Criterion = 'sse'
+  ## decisions use partial F-test p-values against PEnter/PRemove; any other
+  ## criterion accepts only actions that strictly improve the criterion.
+
+  if (nargin < 1)
+    print_usage ();
+  endif
+
+  ## ── [1] Parse inputs ──────────────────────────────────────────────────────
+  [raw_X, raw_y, var_names, cat_flags, weights, excl_mask, ...
+   modelspec, obs_names, resp_name, pred_names, has_intercept, ...
+   raw_cols, lower_spec, upper_spec, p_enter, p_remove, ...
+   criterion, n_steps, verbose] = ...
+    __stepwiselm_parse_inputs__ (varargin{:});
+
+  n_pred = numel (pred_names);
+
+  ## ── [2] Build lower/upper terms matrices ──────────────────────────────────
+  lower_terms_pred = __spec_to_terms__ (lower_spec, n_pred, true);
+  upper_terms_pred = __spec_to_terms__ (upper_spec, n_pred, true);
+
+  ## Add response column (always 0)
+  lower_terms = [lower_terms_pred, zeros(rows (lower_terms_pred), 1)];
+  upper_terms = [upper_terms_pred, zeros(rows (upper_terms_pred), 1)];
+
+  ## ── [3] Fit starting model ────────────────────────────────────────────────
+  ## Only forward non-default fitlm options so fitlm's own defaults apply.
+  cat_idx = find (cat_flags(1:n_pred));
+  fitlm_args = {};
+  if (! isempty (cat_idx))
+    fitlm_args = [fitlm_args, {'CategoricalVars', cat_idx}];
+  endif
+  if (any (weights != 1))
+    fitlm_args = [fitlm_args, {'Weights', weights}];
+  endif
+  if (any (excl_mask))
+    fitlm_args = [fitlm_args, {'Exclude', excl_mask}];
+  endif
+  if (! isempty (var_names))
+    fitlm_args = [fitlm_args, {'VarNames', var_names}];
+  endif
+
+  if (isa (raw_X, "table"))
+    mdl0 = fitlm (raw_X, modelspec, fitlm_args{:});
+  else
+    mdl0 = fitlm (raw_X, raw_y, modelspec, fitlm_args{:});
+  endif
+
+  current_terms = mdl0.Formula.Terms;
+  incl_mask = mdl0.ObservationInfo.Subset;
+
+  ## ── [4] Initialize step log ───────────────────────────────────────────────
+  ## Parallel arrays indexed by n_log; entry 1 records the starting model.
+  start_lp = mdl0.Formula.LinearPredictor;
+  crit_current = __get_model_crit__ (mdl0, criterion);
+
+  n_log = 1;
+  log_action = {'Start'};
+  log_term = {start_lp};
+  log_terms = {current_terms};
+  log_df = mdl0.NumCoefficients;
+  log_del_df = NaN;
+  log_fstat = NaN;
+  log_pval = NaN;
+  log_critval = crit_current;
+
+  ## ── [5] Check for empty candidate set ─────────────────────────────────────
+  all_candidates = __stepwiselm_candidates__ (current_terms, lower_terms, ...
+                                              upper_terms, cat_flags, ...
+                                              pred_names);
+  if (isempty (all_candidates) && verbose >= 1)
+    fprintf ("No terms to add to or remove from initial model.\n");
+  endif
+
+  ## ── [6] Search loop ───────────────────────────────────────────────────────
+  n_taken = 0;
+  use_ftest = strcmp (criterion, "sse"); ## F-test mode vs criterion mode
+
+  while (n_taken < n_steps)
+    made_change = false;
+
+    ## ── Phase A: BACKWARD SCAN (keep removing) ────────────────────────────
+    ## Repeatedly remove the candidate with the LARGEST p-value above
+    ## PRemove (or best criterion improvement), until none qualifies.
+    while (n_taken < n_steps)
+      candidates = __stepwiselm_candidates__ (current_terms, lower_terms, ...
+                                              upper_terms, cat_flags, ...
+                                              pred_names);
+      ## Filter to remove only
+      rem_idx = [];
+      for ci = 1:numel (candidates)
+        if (strcmp (candidates(ci).action, "remove"))
+          rem_idx(end+1) = ci;
+        endif
+      endfor
+
+      if (isempty (rem_idx))
+        break;
+      endif
+
+      ## Score each removal candidate
+      best_ri = 0;
+      best_pval = -Inf;
+      best_fstat = 0;
+      best_crit = Inf;
+      best_df = 0;
+
+      for ri = 1:numel (rem_idx)
+        c = candidates(rem_idx(ri));
+        [f, p, ~, ~, df_t, cr] = __stepwiselm_ftest__ ( ...
+          raw_X, raw_y, weights, incl_mask, current_terms, c, ...
+          cat_flags, var_names, pred_names, raw_cols, has_intercept, ...
+          criterion);
+
+        if (verbose >= 2)
+          if (use_ftest)
+            fprintf (" pValue for removing %s is %g\n", c.term_name, p);
+          else
+            fprintf (" %s for removing %s is %g\n", ...
+                     upper (criterion), c.term_name, cr);
+          endif
+        endif
+
+        should_take = false;
+        if (use_ftest)
+          ## Remove the least significant term (largest p above threshold)
+          if (p > p_remove && p > best_pval)
+            should_take = true;
+          endif
+        else
+          ## Criterion mode: only strict improvements over current model
+          if (cr < crit_current && cr < best_crit)
+            should_take = true;
+          endif
+        endif
+
+        if (should_take)
+          best_ri = rem_idx(ri);
+          best_pval = p;
+          best_fstat = f;
+          best_crit = cr;
+          best_df = df_t;
+        endif
+      endfor
+
+      if (best_ri == 0)
+        break; ## no qualifying removal
+      endif
+
+      ## Apply removal (drop the first matching term row)
+      best_c = candidates(best_ri);
+      mask = true (rows (current_terms), 1);
+      for j = 1:rows (current_terms)
+        if (isequal (current_terms(j, :), best_c.term_row))
+          mask(j) = false;
+          break;
+        endif
+      endfor
+      current_terms = current_terms(mask, :);
+      n_taken++;
+      made_change = true;
+
+      ## Update criterion
+      if (! use_ftest)
+        crit_current = best_crit;
+      endif
+
+      ## Log
+      n_log++;
+      log_action{n_log} = "Remove";
+      log_term{n_log} = best_c.term_name;
+      log_terms{n_log} = current_terms;
+      log_df(n_log) = log_df(n_log - 1) - best_df;
+      log_del_df(n_log) = -best_df;
+      log_fstat(n_log) = best_fstat;
+      log_pval(n_log) = best_pval;
+      log_critval(n_log) = best_crit;
+
+      ## Verbose
+      if (verbose >= 1)
+        __print_step__ (n_taken, best_c, best_fstat, best_pval, ...
+                        best_crit, criterion);
+      endif
+    endwhile ## backward
+
+    if (n_taken >= n_steps)
+      break;
+    endif
+
+    ## ── Phase B: FORWARD SCAN (add one) ───────────────────────────────────
+    ## Add the single most significant eligible term (smallest p below
+    ## PEnter, or best criterion improvement), then loop back to Phase A.
+    candidates = __stepwiselm_candidates__ (current_terms, lower_terms, ...
+                                            upper_terms, cat_flags, ...
+                                            pred_names);
+    ## Filter to add only
+    add_idx = [];
+    for ci = 1:numel (candidates)
+      if (strcmp (candidates(ci).action, "add"))
+        add_idx(end+1) = ci;
+      endif
+    endfor
+
+    if (isempty (add_idx))
+      break; ## converged
+    endif
+
+    ## Score each add candidate
+    best_ai = 0;
+    best_pval_a = Inf;
+    best_fstat_a = 0;
+    best_crit_a = Inf;
+    best_df_a = 0;
+
+    for ai = 1:numel (add_idx)
+      c = candidates(add_idx(ai));
+      [f, p, ~, ~, df_t, cr] = __stepwiselm_ftest__ ( ...
+        raw_X, raw_y, weights, incl_mask, current_terms, c, ...
+        cat_flags, var_names, pred_names, raw_cols, has_intercept, ...
+        criterion);
+
+      if (verbose >= 2)
+        if (use_ftest)
+          fprintf (" pValue for adding %s is %g\n", c.term_name, p);
+        else
+          fprintf (" %s for adding %s is %g\n", ...
+                   upper (criterion), c.term_name, cr);
+        endif
+      endif
+
+      should_take = false;
+      if (use_ftest)
+        if (p < p_enter && p < best_pval_a)
+          should_take = true;
+        endif
+      else
+        if (cr < crit_current && cr < best_crit_a)
+          should_take = true;
+        endif
+      endif
+
+      if (should_take)
+        best_ai = add_idx(ai);
+        best_pval_a = p;
+        best_fstat_a = f;
+        best_crit_a = cr;
+        best_df_a = df_t;
+      endif
+    endfor
+
+    if (best_ai == 0)
+      if (! made_change)
+        break; ## converged — no add and no remove happened
+      endif
+      ## NOTE(review): both paths break here, so made_change does not affect
+      ## control flow at this point — confirm whether the intent was to
+      ## continue the outer loop after a removal-only iteration.
+      break;
+    endif
+
+    ## Apply addition
+    best_c = candidates(best_ai);
+    current_terms = [current_terms; best_c.term_row];
+    ## Sort by term order
+    term_order = sum (current_terms(:, 1:n_pred), 2);
+    [~, si] = sort (term_order);
+    current_terms = current_terms(si, :);
+    n_taken++;
+    made_change = true;
+
+    ## Update criterion
+    if (! use_ftest)
+      crit_current = best_crit_a;
+    endif
+
+    ## Log
+    n_log++;
+    log_action{n_log} = "Add";
+    log_term{n_log} = best_c.term_name;
+    log_terms{n_log} = current_terms;
+    log_df(n_log) = log_df(n_log - 1) + best_df_a;
+    log_del_df(n_log) = best_df_a;
+    log_fstat(n_log) = best_fstat_a;
+    log_pval(n_log) = best_pval_a;
+    log_critval(n_log) = best_crit_a;
+
+    ## Verbose
+    if (verbose >= 1)
+      __print_step__ (n_taken, best_c, best_fstat_a, best_pval_a, ...
+                      best_crit_a, criterion);
+    endif
+
+    ## Continue outer loop → back to backward scan
+  endwhile
+
+  ## ── [7] Fit final model ───────────────────────────────────────────────────
+  if (isa (raw_X, "table"))
+    mdl = fitlm (raw_X, current_terms, fitlm_args{:});
+  else
+    mdl = fitlm (raw_X, raw_y, current_terms, fitlm_args{:});
+  endif
+
+  ## ── [8] Build Steps struct ────────────────────────────────────────────────
+  Steps.Start = [resp_name, " ~ ", log_term{1}];
+  Steps.Lower = lower_spec;
+  Steps.Upper = upper_spec;
+  Steps.Criterion = upper (criterion);
+  Steps.PEnter = p_enter;
+  Steps.PRemove = p_remove;
+
+  ## Build History struct (mimicking a table)
+  History.Action = log_action(:);
+  History.TermName = cell (n_log, 1);
+  for k = 1:n_log
+    History.TermName{k} = {log_term{k}};
+  endfor
+  History.Terms = log_terms(:);
+  History.DF = log_df(:);
+  History.delDF = log_del_df(:);
+
+  if (use_ftest)
+    History.FStat = log_fstat(:);
+    History.pValue = log_pval(:);
+  else
+    ## For AIC/BIC, the column is the criterion name, not FStat/pValue
+    History.(upper (criterion)) = log_critval(:);
+  endif
+
+  Steps.History = History;
+
+  ## ── [9] Inject Steps into LinearModel ─────────────────────────────────────
+  ## NOTE(review): assumes LinearModel exposes a settable Steps property —
+  ## confirm against the LinearModel class definition.
+  mdl.Steps = Steps;
+
+endfunction
+
+
+## ═══════════════════════════════════════════════════════════════════════════════
+## Local helper functions
+## ═══════════════════════════════════════════════════════════════════════════════
+
+function terms = __spec_to_terms__ (spec, p, has_intercept)
+  ## Convert a
shorthand modelspec string to a (n_terms x p) terms matrix. + ## Also handles formula strings 'y ~ x1 + x2' and numeric matrices. + + if (isnumeric (spec) && ismatrix (spec)) + terms = spec; + ## Strip response column if present + if (columns (terms) == p + 1) + terms = terms(:, 1:p); + endif + return; + endif + + if (! ischar (spec)) + spec = "interactions"; + endif + + ## Check for formula string + if (! isempty (strfind (spec, "~"))) + ## Formula — extract terms. For upper/lower bounds this is unusual + ## but handle gracefully: just use 'interactions' as fallback + spec = "interactions"; + endif + + switch (lower (spec)) + case "constant" + terms = zeros (1, p); + + case "linear" + terms = [zeros(1, p); eye(p)]; + + case "interactions" + lin = eye (p); + inter = []; + for i = 1:p-1 + for j = i+1:p + r = zeros (1, p); + r(i) = 1; r(j) = 1; + inter = [inter; r]; + endfor + endfor + terms = [zeros(1, p); lin; inter]; + + case "purequadratic" + terms = [zeros(1, p); eye(p); 2*eye(p)]; + + case {"quadratic", "full"} + lin = eye (p); + inter = []; + for i = 1:p-1 + for j = i+1:p + r = zeros (1, p); + r(i) = 1; r(j) = 1; + inter = [inter; r]; + endfor + endfor + terms = [zeros(1, p); lin; inter; 2*eye(p)]; + + otherwise + terms = [zeros(1, p); eye(p)]; + endswitch +endfunction + + +function val = __get_model_crit__ (mdl, criterion) + switch (criterion) + case "sse" + val = mdl.SSE; + case "aic" + val = mdl.ModelCriterion.AIC; + case "bic" + val = mdl.ModelCriterion.BIC; + case "rsquared" + val = -mdl.Rsquared.Ordinary; + case "adjrsquared" + val = -mdl.Rsquared.Adjusted; + otherwise + val = mdl.SSE; + endswitch +endfunction + + +function __print_step__ (step_num, candidate, fstat, pval, critval, criterion) + action_str = [toupper(candidate.action(1)), candidate.action(2:end)]; + action_verb = "Adding"; + if (strcmp (candidate.action, "remove")) + action_verb = "Removing"; + endif + + if (strcmp (criterion, "sse")) + fprintf ("%d. %s %s, FStat = %g, pValue = %g\n", ... 
+ step_num, action_verb, candidate.term_name, fstat, pval); + else + fprintf ("%d. %s %s, %s = %g\n", ... + step_num, action_verb, candidate.term_name, ... + upper (criterion), critval); + endif +endfunction + + +## ═══════════════════════════════════════════════════════════════════════════════ +## Unit tests +## ═══════════════════════════════════════════════════════════════════════════════ + +## Test 1: PEnter >= PRemove error +%!error +%! rng (1); +%! X = randn (20, 3); +%! y = randn (20, 1); +%! stepwiselm (X, y, 'constant', 'PEnter', 0.1, 'PRemove', 0.05); +