diff --git a/inst/CompactLinearModel.m b/inst/CompactLinearModel.m
new file mode 100644
index 00000000..22ed45cb
--- /dev/null
+++ b/inst/CompactLinearModel.m
@@ -0,0 +1,786 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <https://www.gnu.org/licenses/>.
+
+classdef CompactLinearModel < handle
+## -*- texinfo -*-
+## @deftp {statistics} CompactLinearModel
+##
+## Compact linear regression model.
+##
+## A @code{CompactLinearModel} object is a compact representation of a linear
+## regression model. It contains the regression results (coefficient
+## estimates, goodness-of-fit statistics, and model information) but does not
+## contain the training data, residuals, or observation-level diagnostics.
+##
+## A @code{CompactLinearModel} object is created by calling @code{compact} on
+## a @code{LinearModel} object. Users cannot construct a
+## @code{CompactLinearModel} directly.
+##
+## @strong{Properties}
+##
+## All properties are read-only.
+##
+## @multitable @columnfractions 0.30 0.70
+## @headitem Property @tab Description
+## @item @code{Coefficients} @tab Table of coefficient estimates, SE, tStat,
+## and pValue.
+## @item @code{CoefficientCovariance} @tab Variance-covariance matrix of the
+## coefficient estimates.
+## @item @code{CoefficientNames} @tab Cell array of coefficient names.
+## @item @code{DFE} @tab Error degrees of freedom.
+## @item @code{Formula} @tab Struct describing the model formula.
+## @item @code{LogLikelihood} @tab Log-likelihood of the fitted model.
+## @item @code{ModelCriterion} @tab Struct with AIC, AICc, BIC, CAIC.
+## @item @code{MSE} @tab Mean squared error.
+## @item @code{NumCoefficients} @tab Total number of coefficients.
+## @item @code{NumEstimatedCoefficients} @tab Number of estimated (non-rank-
+## deficient) coefficients.
+## @item @code{NumObservations} @tab Number of observations used in fitting.
+## @item @code{NumPredictors} @tab Number of predictor variables.
+## @item @code{NumVariables} @tab Total number of variables.
+## @item @code{PredictorNames} @tab Cell array of predictor variable names.
+## @item @code{ResponseName} @tab Name of the response variable.
+## @item @code{RMSE} @tab Root mean squared error.
+## @item @code{Robust} @tab Robust fitting information (empty for OLS).
+## @item @code{Rsquared} @tab Struct with Ordinary and Adjusted R-squared.
+## @item @code{SSE} @tab Sum of squared errors.
+## @item @code{SSR} @tab Regression sum of squares.
+## @item @code{SST} @tab Total sum of squares.
+## @item @code{VariableInfo} @tab Table of variable metadata.
+## @item @code{VariableNames} @tab Cell array of all variable names.
+## @end multitable
+##
+## @strong{Methods}
+##
+## @multitable @columnfractions 0.30 0.70
+## @headitem Method @tab Description
+## @item @code{predict} @tab Predict response for new data.
+## @item @code{coefCI} @tab Confidence intervals for coefficients.
+## @item @code{coefTest} @tab Linear hypothesis test on coefficients.
+## @end multitable
+##
+## @seealso{fitlm}
+## @end deftp
+
+ properties (SetAccess = protected)
+
+ ## Coefficient estimates
+ Coefficients = [];
+ CoefficientNames = {};
+ CoefficientCovariance = [];
+ NumCoefficients = 0;
+ NumEstimatedCoefficients = 0;
+
+ ## Summary statistics
+ DFE = 0;
+ SSE = 0;
+ SSR = 0;
+ SST = 0;
+ MSE = 0;
+ RMSE = 0;
+ Rsquared = [];
+ LogLikelihood = 0;
+ ModelCriterion = [];
+
+ ## Fitting method
+ Robust = [];
+
+ ## Input data info (survives compact())
+ Formula = [];
+ NumObservations = 0;
+ NumPredictors = 0;
+ NumVariables = 0;
+ PredictorNames = {};
+ ResponseName = '';
+ VariableInfo = [];
+ VariableNames = {};
+
+ endproperties
+
+ ## Constructor — Access = protected so only LinearModel (subclass) or
+ ## internal factory methods can create CompactLinearModel objects.
+ ## When Octave supports (Access = ?LinearModel), switch to that.
+ methods (Access = protected)
+
+    function obj = CompactLinearModel (s)
+      ## CompactLinearModel Constructor (internal use only).
+      ##
+      ## obj = CompactLinearModel (s)
+      ##
+      ## S is a struct containing all fields needed to populate the object.
+      ## This constructor is called by LinearModel.compact() or by fitlm's
+      ## internal builder.  Users should not call it directly.
+
+      if (nargin < 1)
+        ## Allow zero-arg construction for Octave classdef internals
+        return;
+      endif
+
+      if (! isstruct (s))
+        error ("CompactLinearModel: constructor argument must be a struct.");
+      endif
+
+      ## Copy every expected field of S into the matching property.  The
+      ## explicit list (rather than copying all fields of S) guards against
+      ## silently importing unexpected fields, and a missing field now
+      ## produces a clear diagnostic instead of an undefined-value error.
+      fields = {"Coefficients", "CoefficientNames", "CoefficientCovariance", ...
+                "NumCoefficients", "NumEstimatedCoefficients", "DFE", ...
+                "SSE", "SSR", "SST", "MSE", "RMSE", "Rsquared", ...
+                "LogLikelihood", "ModelCriterion", "Robust", "Formula", ...
+                "NumObservations", "NumPredictors", "NumVariables", ...
+                "PredictorNames", "ResponseName", "VariableInfo", ...
+                "VariableNames"};
+      for k = 1:numel (fields)
+        if (! isfield (s, fields{k}))
+          error ("CompactLinearModel: missing required field '%s'.", ...
+                 fields{k});
+        endif
+        obj.(fields{k}) = s.(fields{k});
+      endfor
+
+    endfunction
+
+ endmethods
+
+ ## Hidden methods: display, disp, subsref
+ methods (Hidden)
+
+ ## Custom display — called when variable name is typed at prompt
+    function display (obj)
+      ## Print the variable name header (when the object was referenced by
+      ## name) and then delegate the formatted summary to disp.
+      vname = inputname (1);
+      if (isempty (vname))
+        disp (obj);
+      else
+        fprintf ("%s =\n", vname);
+        disp (obj);
+      endif
+    endfunction
+
+ ## Custom disp — replicates MATLAB's LinearModel display format
+    function disp (obj)
+      ## Print the model summary: class header, formula line, coefficient
+      ## table, and goodness-of-fit footer, replicating MATLAB's
+      ## LinearModel display format.
+
+      fprintf ("\n");
+
+      ## Show "Compact" prefix for CompactLinearModel, not for LinearModel
+      if (strcmp (class (obj), "CompactLinearModel"))
+        fprintf ("Compact linear regression model:\n");
+      else
+        fprintf ("Linear regression model:\n");
+      endif
+
+      ## Formula line: ResponseName ~ LinearPredictor
+      ## Formula may be a LinearFormula object or a plain struct; when it
+      ## is neither (or lacks LinearPredictor) the formula line is skipped.
+      lp_str = "";
+      if (isa (obj.Formula, "LinearFormula"))
+        lp_str = obj.Formula.LinearPredictor;
+      elseif (isstruct (obj.Formula) && isfield (obj.Formula, "LinearPredictor"))
+        lp_str = obj.Formula.LinearPredictor;
+      endif
+      if (! isempty (lp_str))
+        if (! isempty (obj.ResponseName))
+          fprintf (" %s ~ %s\n", obj.ResponseName, lp_str);
+        else
+          fprintf (" %s\n", lp_str);
+        endif
+      endif
+
+      fprintf ("\n");
+      fprintf ("Estimated Coefficients:\n");
+
+      ## --- Build coefficient table display ---
+      coef_names = obj.CoefficientNames;
+      p = numel (coef_names);
+
+      ## Extract numeric vectors from Coefficients
+      ## (struct and table storage expose the same member names)
+      if (isstruct (obj.Coefficients))
+        est = obj.Coefficients.Estimate;
+        se = obj.Coefficients.SE;
+        ts = obj.Coefficients.tStat;
+        pv = obj.Coefficients.pValue;
+      elseif (isa (obj.Coefficients, "table"))
+        est = obj.Coefficients.Estimate;
+        se = obj.Coefficients.SE;
+        ts = obj.Coefficients.tStat;
+        pv = obj.Coefficients.pValue;
+      else
+        ## Unknown storage format: skip the coefficient table entirely
+        est = []; se = []; ts = []; pv = [];
+      endif
+
+      if (! isempty (est))
+        ## Column headers
+        hdr_fmt = " %-20s %12s %12s %12s %12s\n";
+        fprintf (hdr_fmt, "", "Estimate", "SE", "tStat", "pValue");
+
+        ## Separator line
+        sep_fmt = " %-20s %12s %12s %12s %12s\n";
+        fprintf (sep_fmt, "", "________", "________", "________", "__________");
+        fprintf ("\n");
+
+        ## Data rows
+        for i = 1:p
+          row_fmt = " %-20s %12.6g %12.6g %12.6g %12.4g\n";
+          fprintf (row_fmt, coef_names{i}, est(i), se(i), ts(i), pv(i));
+        endfor
+      endif
+
+      fprintf ("\n");
+
+      ## Summary footer
+      fprintf ("Number of observations: %d, Error degrees of freedom: %d\n", ...
+               obj.NumObservations, obj.DFE);
+      fprintf ("Root Mean Squared Error: %.4g\n", obj.RMSE);
+
+      if (isstruct (obj.Rsquared))
+        fprintf ("R-squared: %.4g, Adjusted R-Squared: %.4g\n", ...
+                 obj.Rsquared.Ordinary, obj.Rsquared.Adjusted);
+      endif
+
+      ## F-statistic vs constant model
+      ## F = (SSR / (NumEstimatedCoefficients - 1)) / MSE
+      ## NOTE(review): 1 - fcdf loses precision for very small p-values;
+      ## fcdf (..., "upper") would avoid the cancellation if the package
+      ## baseline supports it — confirm before changing.
+      if (obj.NumEstimatedCoefficients > 1 && obj.MSE > 0)
+        df1 = obj.NumEstimatedCoefficients - 1;
+        F_val = (obj.SSR / df1) / obj.MSE;
+        F_pval = 1 - fcdf (F_val, df1, obj.DFE);
+        fprintf ("F-statistic vs. constant model: %.4g, p-value = %.4g\n", ...
+                 F_val, F_pval);
+      endif
+
+      fprintf ("\n");
+
+    endfunction
+
+ ## Custom subsref — supports chained dot indexing for properties
+    function varargout = subsref (obj, s)
+      ## Dispatch dot-references on the object.  Method names are routed
+      ## explicitly so that chained indexing such as mdl.predict(X)(1) or
+      ## mdl.coefCI().(...) works; any other "." subscript is treated as
+      ## property access.  () and {} indexing on the object itself are
+      ## rejected outright.
+      chain_s = s(2:end);
+      s = s(1);
+      switch (s.type)
+        case "()"
+          error (strcat ("Invalid () indexing for referencing values", ...
+                         " in a CompactLinearModel object."));
+        case "{}"
+          error (strcat ("Invalid {} indexing for referencing values", ...
+                         " in a CompactLinearModel object."));
+        case "."
+          if (! ischar (s.subs))
+            error (strcat ("CompactLinearModel.subsref: '.'", ...
+                           " indexing argument must be a character vector."));
+          endif
+          ## Check if it's a method call
+          ## Each method case consumes an optional "()" argument list and
+          ## forwards any remaining subscripts to the result before
+          ## returning early.
+          switch (s.subs)
+            case "predict"
+              if (! isempty (chain_s) && strcmp (chain_s(1).type, "()"))
+                args = chain_s(1).subs;
+                chain_s = chain_s(2:end);
+                [varargout{1:nargout}] = predict (obj, args{:});
+              else
+                error ("CompactLinearModel: predict requires arguments.");
+              endif
+              if (! isempty (chain_s))
+                [varargout{1:nargout}] = subsref (varargout{1}, chain_s);
+              endif
+              return;
+            case "coefCI"
+              ## coefCI may be called with or without an argument list
+              if (! isempty (chain_s) && strcmp (chain_s(1).type, "()"))
+                args = chain_s(1).subs;
+                chain_s = chain_s(2:end);
+                [varargout{1:nargout}] = coefCI (obj, args{:});
+              else
+                [varargout{1:nargout}] = coefCI (obj);
+              endif
+              if (! isempty (chain_s))
+                [varargout{1:nargout}] = subsref (varargout{1}, chain_s);
+              endif
+              return;
+            case "coefTest"
+              if (! isempty (chain_s) && strcmp (chain_s(1).type, "()"))
+                args = chain_s(1).subs;
+                chain_s = chain_s(2:end);
+                [varargout{1:nargout}] = coefTest (obj, args{:});
+              else
+                [varargout{1:nargout}] = coefTest (obj);
+              endif
+              if (! isempty (chain_s))
+                [varargout{1:nargout}] = subsref (varargout{1}, chain_s);
+              endif
+              return;
+            ## NOTE(review): compact, addTerms, removeTerms, step, and
+            ## anova are not defined on CompactLinearModel in this file —
+            ## presumably they are LinearModel (subclass) methods reached
+            ## via this inherited subsref.  On a plain CompactLinearModel
+            ## these dispatches would fail at the call — confirm intended.
+            case "compact"
+              [varargout{1:nargout}] = compact (obj);
+              if (! isempty (chain_s))
+                [varargout{1:nargout}] = subsref (varargout{1}, chain_s);
+              endif
+              return;
+            case "addTerms"
+              if (! isempty (chain_s) && strcmp (chain_s(1).type, "()"))
+                args = chain_s(1).subs;
+                chain_s = chain_s(2:end);
+                [varargout{1:nargout}] = addTerms (obj, args{:});
+              else
+                error ("CompactLinearModel: addTerms requires arguments.");
+              endif
+              if (! isempty (chain_s))
+                [varargout{1:nargout}] = subsref (varargout{1}, chain_s);
+              endif
+              return;
+            case "removeTerms"
+              if (! isempty (chain_s) && strcmp (chain_s(1).type, "()"))
+                args = chain_s(1).subs;
+                chain_s = chain_s(2:end);
+                [varargout{1:nargout}] = removeTerms (obj, args{:});
+              else
+                error ("CompactLinearModel: removeTerms requires arguments.");
+              endif
+              if (! isempty (chain_s))
+                [varargout{1:nargout}] = subsref (varargout{1}, chain_s);
+              endif
+              return;
+            case "step"
+              if (! isempty (chain_s) && strcmp (chain_s(1).type, "()"))
+                args = chain_s(1).subs;
+                chain_s = chain_s(2:end);
+                [varargout{1:nargout}] = step (obj, args{:});
+              else
+                [varargout{1:nargout}] = step (obj);
+              endif
+              if (! isempty (chain_s))
+                [varargout{1:nargout}] = subsref (varargout{1}, chain_s);
+              endif
+              return;
+            case "anova"
+              if (! isempty (chain_s) && strcmp (chain_s(1).type, "()"))
+                args = chain_s(1).subs;
+                chain_s = chain_s(2:end);
+                [varargout{1:nargout}] = anova (obj, args{:});
+              else
+                [varargout{1:nargout}] = anova (obj);
+              endif
+              if (! isempty (chain_s))
+                [varargout{1:nargout}] = subsref (varargout{1}, chain_s);
+              endif
+              return;
+            case "feval"
+              ## feval is an alias for predict (see the feval method below)
+              if (! isempty (chain_s) && strcmp (chain_s(1).type, "()"))
+                args = chain_s(1).subs;
+                chain_s = chain_s(2:end);
+                [varargout{1:nargout}] = predict (obj, args{:});
+              else
+                error ("CompactLinearModel: feval requires arguments.");
+              endif
+              if (! isempty (chain_s))
+                [varargout{1:nargout}] = subsref (varargout{1}, chain_s);
+              endif
+              return;
+            case "disp"
+              ## Display-only path: no output is produced
+              disp (obj);
+              return;
+          endswitch
+          ## Property access
+          ## Dynamic field access raises on unknown names; remap that to a
+          ## uniform diagnostic.
+          try
+            out = obj.(s.subs);
+          catch
+            error ("CompactLinearModel.subsref: unrecognized property: '%s'.", ...
+                   s.subs);
+          end_try_catch
+      endswitch
+      ## Chained references on the retrieved property value
+      if (! isempty (chain_s))
+        out = subsref (out, chain_s);
+      endif
+      varargout{1} = out;
+    endfunction
+
+ ## Prevent direct property assignment by users
+    function obj = subsasgn (obj, s, val)
+      ## All properties are read-only from outside the class, so every
+      ## user-level assignment attempt — (), {}, or dot — raises an error.
+      if (numel (s) > 1)
+        error (strcat ("CompactLinearModel.subsasgn:", ...
+                       " chained subscripts not allowed."));
+      endif
+      if (strcmp (s.type, "()"))
+        error (strcat ("Invalid () indexing for assigning values", ...
+                       " to a CompactLinearModel object."));
+      elseif (strcmp (s.type, "{}"))
+        error (strcat ("Invalid {} indexing for assigning values", ...
+                       " to a CompactLinearModel object."));
+      else
+        error (strcat ("CompactLinearModel.subsasgn:", ...
+                       " all properties are read-only: '%s'."), s.subs);
+      endif
+    endfunction
+
+ endmethods
+
+ ## Public methods: predict, coefCI, coefTest
+ methods (Access = public)
+
+ ## -*- texinfo -*-
+ ## @deftypefn {CompactLinearModel} {@var{ypred} =} predict (@var{obj}, @var{Xnew})
+ ## @deftypefnx {CompactLinearModel} {[@var{ypred}, @var{yci}] =} predict (@var{obj}, @var{Xnew})
+ ## @deftypefnx {CompactLinearModel} {[@var{ypred}, @var{yci}] =} predict (@dots{}, @var{Name}, @var{Value})
+ ##
+ ## Predict response for new data using a compact linear regression model.
+ ##
+ ## @code{@var{ypred} = predict (@var{obj}, @var{Xnew})} returns the predicted
+ ## response values for the predictor data in @var{Xnew} using the fitted
+ ## model @var{obj}. @var{Xnew} must be a numeric matrix with the same
+ ## number of predictor columns as used to fit the model (excluding
+ ## intercept — it is added automatically if the model has one).
+ ##
+ ## @code{[@var{ypred}, @var{yci}] = predict (@dots{})} also returns
+ ## confidence or prediction intervals in the @math{n x 2} matrix @var{yci}.
+ ##
+ ## Additional options can be specified as @qcode{Name-Value} pairs:
+ ##
+ ## @table @code
+ ## @item Alpha
+ ## Significance level for the confidence/prediction intervals.
+ ## Default is 0.05 (95% intervals).
+ ##
+ ## @item Prediction
+ ## Type of interval: @qcode{'curve'} for confidence intervals on the
+ ## mean response (default), or @qcode{'observation'} for prediction
+ ## intervals on individual new observations.
+ ##
+ ## @item Simultaneous
+ ## If @code{true}, compute simultaneous Scheffé confidence bands.
+ ## Default is @code{false} (pointwise intervals).
+ ## @end table
+ ##
+ ## @seealso{CompactLinearModel, coefCI, coefTest}
+ ## @end deftypefn
+
+    function [ypred, yci] = predict (obj, Xnew, varargin)
+      ## Predict response values (and optional confidence or prediction
+      ## intervals) for the predictor data XNEW; see the texinfo help
+      ## above for the accepted Name-Value options.
+
+      if (nargin < 2)
+        error ("CompactLinearModel.predict: Xnew is required.");
+      endif
+
+      ## Parse name-value pairs
+      alpha = 0.05;
+      pred_obs = false; ## false = curve (CI on mean), true = observation (PI)
+      simult = false;
+
+      i = 1;
+      while (i <= numel (varargin))
+        if (! ischar (varargin{i}))
+          error ("CompactLinearModel.predict: expected parameter name string.");
+        endif
+        if (i == numel (varargin))
+          ## BUGFIX: a trailing parameter name with no value previously hit
+          ## an obscure out-of-bound cell index error on varargin{i+1}.
+          error ("CompactLinearModel.predict: missing value for parameter '%s'.", ...
+                 varargin{i});
+        endif
+        switch (lower (varargin{i}))
+          case "alpha"
+            alpha = varargin{i+1};
+            i = i + 2;
+          case "prediction"
+            pred_obs = strcmpi (varargin{i+1}, "observation");
+            i = i + 2;
+          case "simultaneous"
+            simult = varargin{i+1};
+            i = i + 2;
+          otherwise
+            error ("CompactLinearModel.predict: unknown parameter '%s'.", ...
+                   varargin{i});
+        endswitch
+      endwhile
+
+      ## Validate the significance level (new robustness check)
+      if (! (isnumeric (alpha) && isscalar (alpha) && alpha > 0 && alpha < 1))
+        error ("CompactLinearModel.predict: Alpha must be a scalar in (0,1).");
+      endif
+
+      ## Determine intercept flag; Formula may be a LinearFormula object,
+      ## a plain struct, or empty.
+      has_int = false;
+      if (isa (obj.Formula, "LinearFormula"))
+        has_int = obj.Formula.HasIntercept;
+      elseif (isstruct (obj.Formula) && isfield (obj.Formula, "HasIntercept"))
+        has_int = obj.Formula.HasIntercept;
+      endif
+
+      ## Unpack table input into numeric design matrix
+      if (isa (Xnew, "table"))
+        Xnew = __predict_table_to_Xmat__ (obj, Xnew, has_int);
+        ## Xnew is now the full design matrix (with intercept if applicable)
+        Xmat = Xnew;
+      else
+        ## Standard numeric matrix path
+        if (has_int)
+          Xmat = [ones(rows (Xnew), 1), Xnew];
+        else
+          Xmat = Xnew;
+        endif
+      endif
+
+      ## Check dimensions
+      if (columns (Xmat) != obj.NumCoefficients)
+        ## BUGFIX: use the locally derived HAS_INT flag here.  The previous
+        ## reference to obj.Formula.HasIntercept would itself error whenever
+        ## Formula is empty or a struct without that field, masking the real
+        ## dimension-mismatch diagnostic.
+        error (strcat ("CompactLinearModel.predict: Xnew has wrong", ...
+                       " number of columns. Expected %d predictor columns."), ...
+               obj.NumCoefficients - double (has_int));
+      endif
+
+      ## Extract coefficient estimates
+      if (isa (obj.Coefficients, "table"))
+        beta = obj.Coefficients.Estimate;
+      elseif (isstruct (obj.Coefficients))
+        beta = obj.Coefficients.Estimate;
+      else
+        error ("CompactLinearModel.predict: invalid Coefficients format.");
+      endif
+
+      ## Point predictions
+      ypred = Xmat * beta;
+
+      ## Compute intervals if requested
+      if (nargout > 1)
+        ## SE of the fitted mean: sqrt of diag (X * Cov * X'), row-wise
+        se_fit = sqrt (sum ((Xmat * obj.CoefficientCovariance) .* Xmat, 2));
+
+        if (pred_obs)
+          ## Prediction interval: add residual variance for a new observation
+          se_pred = sqrt (se_fit .^ 2 + obj.MSE);
+        else
+          se_pred = se_fit;
+        endif
+
+        if (simult)
+          ## Scheffé-type simultaneous band
+          p_eff = obj.NumEstimatedCoefficients;
+          F_crit = finv (1 - alpha, p_eff, obj.DFE);
+          hw = sqrt (p_eff * F_crit) .* se_pred;
+        else
+          ## Pointwise two-sided t interval
+          hw = tinv (1 - alpha / 2, obj.DFE) .* se_pred;
+        endif
+
+        yci = [ypred - hw, ypred + hw];
+      endif
+
+    endfunction
+
+ ## -*- texinfo -*-
+ ## @deftypefn {CompactLinearModel} {@var{ci} =} coefCI (@var{obj})
+ ## @deftypefnx {CompactLinearModel} {@var{ci} =} coefCI (@var{obj}, @var{alpha})
+ ##
+ ## Confidence intervals for coefficient estimates.
+ ##
+ ## @code{@var{ci} = coefCI (@var{obj})} returns a @math{p x 2} matrix
+ ## of 95% confidence intervals for the coefficient estimates, where
+ ## @math{p} is the number of coefficients.
+ ##
+ ## @code{@var{ci} = coefCI (@var{obj}, @var{alpha})} specifies the
+ ## significance level. For example, @code{alpha = 0.01} gives 99%
+ ## confidence intervals.
+ ##
+ ## @seealso{CompactLinearModel, predict, coefTest}
+ ## @end deftypefn
+
+    function ci = coefCI (obj, alpha)
+      ## Two-sided (1 - alpha) confidence intervals for the coefficient
+      ## estimates, based on the Student's t distribution with obj.DFE
+      ## degrees of freedom.  Default alpha is 0.05 (95% intervals).
+
+      if (nargin < 2)
+        alpha = 0.05;
+      endif
+
+      ## Pull estimates and standard errors out of the Coefficients store,
+      ## which may be a table or a plain struct (same member names).
+      C = obj.Coefficients;
+      if (isa (C, "table") || isstruct (C))
+        est = C.Estimate;
+        se = C.SE;
+      else
+        error ("CompactLinearModel.coefCI: invalid Coefficients format.");
+      endif
+
+      ## Half-width of each interval, then assemble [lower, upper]
+      hw = tinv (1 - alpha / 2, obj.DFE) * se;
+      ci = [est - hw, est + hw];
+
+    endfunction
+
+ ## -*- texinfo -*-
+ ## @deftypefn {CompactLinearModel} {[@var{pval}, @var{F}, @var{r}] =} coefTest (@var{obj})
+ ## @deftypefnx {CompactLinearModel} {[@var{pval}, @var{F}, @var{r}] =} coefTest (@var{obj}, @var{H})
+ ## @deftypefnx {CompactLinearModel} {[@var{pval}, @var{F}, @var{r}] =} coefTest (@var{obj}, @var{H}, @var{c})
+ ##
+ ## Linear hypothesis test on coefficients.
+ ##
+ ## @code{[@var{pval}, @var{F}, @var{r}] = coefTest (@var{obj})} performs
+ ## the overall F-test: tests whether all non-intercept coefficients are
+ ## simultaneously zero. The F-statistic matches the "F-statistic vs.
+ ## constant model" shown by @code{disp}.
+ ##
+ ## @code{[@var{pval}, @var{F}, @var{r}] = coefTest (@var{obj}, @var{H})}
+ ## tests the hypothesis @code{@var{H} * beta = 0}, where @var{H} is a
+ ## contrast matrix.
+ ##
+ ## @code{[@var{pval}, @var{F}, @var{r}] = coefTest (@var{obj}, @var{H},
+ ## @var{c})} tests the hypothesis @code{@var{H} * beta = @var{c}}.
+ ##
+ ## The output @var{r} is the numerator degrees of freedom for the
+ ## F-test (equal to the number of rows in @var{H}). The F-statistic
+ ## has @var{r} degrees of freedom in the numerator and @code{obj.DFE}
+ ## degrees of freedom in the denominator.
+ ##
+ ## @seealso{CompactLinearModel, predict, coefCI}
+ ## @end deftypefn
+
+    function [pval, F, r] = coefTest (obj, H, c)
+      ## Wald F-test of the linear hypothesis H * beta = c.  With no H,
+      ## the overall F-test (all non-intercept coefficients zero) is
+      ## performed; with no c, a zero right-hand side is assumed.
+
+      ## Extract beta
+      if (isa (obj.Coefficients, "table"))
+        beta = obj.Coefficients.Estimate;
+      elseif (isstruct (obj.Coefficients))
+        beta = obj.Coefficients.Estimate;
+      else
+        error ("CompactLinearModel.coefTest: invalid Coefficients format.");
+      endif
+
+      p = obj.NumCoefficients;
+
+      ## Default H: overall F-test (all non-intercept = 0)
+      ## NOTE(review): assumes the intercept is the first coefficient;
+      ## confirm against the column ordering produced by fitlm.
+      if (nargin < 2 || isempty (H))
+        ## H = [zeros(p-1, 1), eye(p-1)] — tests all columns except intercept
+        H = [zeros(p - 1, 1), eye(p - 1)];
+      endif
+
+      ## Validate the contrast matrix shape (new robustness check)
+      if (columns (H) != p)
+        error ("CompactLinearModel.coefTest: H must have %d columns.", p);
+      endif
+
+      ## Default c: zero vector
+      if (nargin < 3 || isempty (c))
+        c = zeros (rows (H), 1);
+      endif
+      if (numel (c) != rows (H))
+        error ("CompactLinearModel.coefTest: c must have one element per row of H.");
+      endif
+      c = c(:); ## accept a row or column vector for c
+
+      ## Contrast vector (used internally for F computation)
+      contrast = H * beta - c;
+
+      ## Wald F-test: F = (Hb - c)' * (H V H')^{-1} * (Hb - c) / r
+      V = H * obj.CoefficientCovariance * H';
+      df1 = rows (H);
+      F = (contrast' * (V \ contrast)) / df1;
+      ## NOTE(review): 1 - fcdf loses precision for very small p-values;
+      ## fcdf (..., "upper") would be preferable if the package baseline
+      ## supports it.
+      pval = 1 - fcdf (F, df1, obj.DFE);
+
+      ## r = numerator degrees of freedom (= rows of H)
+      r = df1;
+
+    endfunction
+
+    function [varargout] = feval (obj, varargin)
+      ## feval Evaluate model (alias for predict).
+      ##
+      ## ypred = feval (mdl, Xnew)
+      ## [ypred, yci] = feval (mdl, Xnew, ...)
+      ##
+      ## This method enables MATLAB-compatible feval(mdl, Xnew) syntax.
+      ## All inputs and outputs are forwarded to predict unchanged, so the
+      ## Name-Value options accepted by predict work here too.
+
+      [varargout{1:nargout}] = predict (obj, varargin{:});
+    endfunction
+
+ endmethods
+
+ methods (Access = private)
+
+    function Xmat = __predict_table_to_Xmat__ (obj, tbl_new, has_int)
+      ## __predict_table_to_Xmat__ Convert a table of predictors to design matrix.
+      ##
+      ## Extracts columns matching PredictorNames, expands categoricals
+      ## into dummy columns using the same reference coding as the fitted
+      ## model, and prepends an intercept column if needed.
+      ##
+      ## NOTE(review): columns are emitted in PredictorNames order, with
+      ## categorical dummies appended in level order; this must match the
+      ## coefficient ordering produced by fitlm — confirm against fitlm.
+
+      pred_names = obj.PredictorNames;
+      p = numel (pred_names);
+      n_new = rows (tbl_new);
+
+      ## Get the CoefficientNames to determine expected dummy structure
+      coef_names = obj.CoefficientNames;
+
+      ## Determine which predictors are actually in the model
+      ## Use InModel from Formula if available; default is all-in.
+      in_model = true (1, p);
+      if (isa (obj.Formula, "LinearFormula") && ! isempty (obj.Formula.InModel))
+        im = obj.Formula.InModel;
+        in_model = im(1:min(p, numel(im)));
+      elseif (isstruct (obj.Formula) && isfield (obj.Formula, "InModel"))
+        im = obj.Formula.InModel;
+        in_model = im(1:min(p, numel(im)));
+      endif
+
+      ## Build columns for each predictor that is IN the model
+      ## (columns accumulate by concatenation — acceptable for the modest
+      ## predictor counts expected here)
+      Xcols = [];
+      for k = 1:p
+        if (! in_model(k))
+          continue;
+        endif
+        pname = pred_names{k};
+
+        ## Extract column from table
+        try
+          col_data = tbl_new.(pname);
+        catch
+          error ("CompactLinearModel.predict: table is missing predictor column '%s'.", ...
+                 pname);
+        end_try_catch
+
+        if (isa (col_data, "categorical"))
+          ## Categorical: expand into dummy columns using reference coding
+          levs = cellstr (categories (col_data));
+          col_str = cellstr (col_data);
+          ## ismember yields 0 for values not among the new data's level
+          ## set, so such rows get all-zero dummies (treated as reference)
+          ## rather than raising an error.
+          [~, cat_idx] = ismember (col_str, levs);
+          n_lev = numel (levs);
+
+          ## Determine which levels the model expects (from CoefficientNames)
+          ## Look for patterns like PredName_LevelName in coef_names
+          dum_cols = [];
+          dum_found = false;
+          for L = 1:n_lev
+            expected_name = sprintf ("%s_%s", pname, levs{L});
+            if (any (strcmp (coef_names, expected_name)))
+              dum_found = true;
+              dum_cols = [dum_cols, double(cat_idx == L)];
+            endif
+          endfor
+
+          if (! dum_found)
+            ## Fallback: reference coding (drop first level)
+            for L = 2:n_lev
+              dum_cols = [dum_cols, double(cat_idx == L)];
+            endfor
+          endif
+
+          Xcols = [Xcols, dum_cols];
+        else
+          ## Numeric predictor
+          Xcols = [Xcols, double(col_data)];
+        endif
+      endfor
+
+      ## Prepend intercept
+      if (has_int)
+        Xmat = [ones(n_new, 1), Xcols];
+      else
+        Xmat = Xcols;
+      endif
+    endfunction
+
+ endmethods
+
+endclassdef
+
+## Test: CompactLinearModel cannot be constructed directly (protected access)
+## This should error since the constructor is protected.
+%!error CompactLinearModel (struct ())
+
+## Test: verify class definition parses without error
+## (This test just confirms the file is syntactically valid)
+%!test
+%! ## Check that the class exists and is loadable
+%! assert (exist ("CompactLinearModel", "class") > 0 || ...
+%! exist ("CompactLinearModel") > 0);
diff --git a/inst/LinearFormula.m b/inst/LinearFormula.m
new file mode 100644
index 00000000..c29e0ae6
--- /dev/null
+++ b/inst/LinearFormula.m
@@ -0,0 +1,122 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <https://www.gnu.org/licenses/>.
+
+classdef LinearFormula
+## -*- texinfo -*-
+## @deftp {statistics} LinearFormula
+##
+## Linear formula encapsulation class.
+##
+## A @code{LinearFormula} object encapsulates the internal formula
+## representation for a @code{LinearModel} or @code{CompactLinearModel}.
+## When displayed in the console, it shows only the human-readable
+## formula string (e.g., @code{y ~ 1 + x1 + x2}).
+##
+## @strong{Properties}
+##
+## @multitable @columnfractions 0.30 0.70
+## @headitem Property @tab Description
+## @item @code{ResponseName} @tab Name of the response variable.
+## @item @code{LinearPredictor} @tab Human-readable formula string.
+## @item @code{Terms} @tab Binary terms matrix (n_terms x p).
+## @item @code{HasIntercept} @tab Logical: true if model has intercept.
+## @item @code{InModel} @tab Logical vector indicating which terms are
+## active.
+## @item @code{CoefficientNames} @tab Cell array of coefficient names.
+## @item @code{PredictorNames} @tab Cell array of predictor names.
+## @item @code{VariableNames} @tab Cell array of all variable names.
+## @end multitable
+##
+## @seealso{fitlm, LinearModel, CompactLinearModel}
+## @end deftp
+
+  properties (SetAccess = protected)
+    ResponseName = "";
+    LinearPredictor = "";
+    Terms = [];
+    HasIntercept = true;
+    InModel = [];
+    CoefficientNames = {};
+    PredictorNames = {};
+    VariableNames = {};
+  endproperties
+
+  methods
+
+    function obj = LinearFormula (s)
+      ## LinearFormula Constructor.
+      ##
+      ## obj = LinearFormula (s)
+      ##
+      ## S is a struct with formula fields.  If S is already a
+      ## LinearFormula, it is returned as-is.  Fields absent from S keep
+      ## their default property values; unknown fields in S are ignored.
+
+      if (nargin < 1)
+        return;
+      endif
+
+      if (isa (s, "LinearFormula"))
+        obj = s;
+        return;
+      endif
+
+      if (! isstruct (s))
+        error ("LinearFormula: constructor argument must be a struct.");
+      endif
+
+      ## Copy each recognized field from S (replaces eight repetitive
+      ## isfield/assignment blocks with a single data-driven loop).
+      props = {"ResponseName", "LinearPredictor", "Terms", "HasIntercept", ...
+               "InModel", "CoefficientNames", "PredictorNames", ...
+               "VariableNames"};
+      for k = 1:numel (props)
+        if (isfield (s, props{k}))
+          obj.(props{k}) = s.(props{k});
+        endif
+      endfor
+    endfunction
+
+    function disp (obj)
+      ## disp Display only the formula string.
+      ##
+      ## Matches MATLAB's LinearFormula display behavior: shows only
+      ## the human-readable formula (e.g., "y ~ 1 + x1 + x2").
+      fprintf (" %s ~ %s\n", obj.ResponseName, obj.LinearPredictor);
+    endfunction
+
+    function s = char (obj)
+      ## char Convert to character representation ("response ~ predictor").
+      s = sprintf ("%s ~ %s", obj.ResponseName, obj.LinearPredictor);
+    endfunction
+
+  endmethods
+
+endclassdef
diff --git a/inst/LinearModel.m b/inst/LinearModel.m
new file mode 100644
index 00000000..93713c36
--- /dev/null
+++ b/inst/LinearModel.m
@@ -0,0 +1,1304 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <https://www.gnu.org/licenses/>.
+
+classdef LinearModel < CompactLinearModel
+## -*- texinfo -*-
+## @deftp {statistics} LinearModel
+##
+## Linear regression model.
+##
+## A @code{LinearModel} object encapsulates the results of fitting a linear
+## regression model. It inherits from @code{CompactLinearModel} and adds
+## the source training data, observation-level residuals and diagnostics,
+## and model modification methods.
+##
+## A @code{LinearModel} object is created by @code{fitlm} or
+## @code{stepwiselm}. Users do not call the constructor directly.
+##
+## @strong{Additional Properties (beyond CompactLinearModel)}
+##
+## @multitable @columnfractions 0.25 0.75
+## @headitem Property @tab Description
+## @item @code{Variables} @tab Original input data (all rows, including excluded).
+## @item @code{ObservationNames} @tab Cell array of row names.
+## @item @code{ObservationInfo} @tab Table with columns Weights, Excluded, Missing, Subset.
+## @item @code{Fitted} @tab Fitted values for all observations.
+## @item @code{Residuals} @tab Table with columns Raw, Pearson, Studentized, Standardized.
+## @item @code{Diagnostics} @tab Table with columns Leverage, CooksDistance, Dffits, S2_i, CovRatio, Dfbetas, HatMatrix.
+## @item @code{ModelFitVsNullModel} @tab Struct with fields Fstat, Pvalue, NullModel.
+## @item @code{Steps} @tab Stepwise fitting history (empty for fitlm).
+## @end multitable
+##
+## @strong{Methods}
+##
+## @multitable @columnfractions 0.25 0.75
+## @headitem Method @tab Description
+## @item @code{compact} @tab Strip to CompactLinearModel.
+## @item @code{addTerms} @tab Add terms and refit.
+## @item @code{removeTerms} @tab Remove terms and refit.
+## @item @code{step} @tab Single optimal stepwise step.
+## @item @code{anova} @tab ANOVA table (partial SS).
+## @end multitable
+##
+## @seealso{fitlm, CompactLinearModel}
+## @end deftp
+
+  properties (SetAccess = protected)
+    ## Source data (not on CompactLinearModel)
+    Variables = [];           # original input data, all rows incl. excluded
+    ObservationNames = {};    # cell array of row names ({} if unnamed)
+    ObservationInfo = [];     # table: Weights, Excluded, Missing, Subset
+
+    ## Diagnostics (not on CompactLinearModel)
+    Fitted = [];              # fitted values for ALL observations
+    Residuals = [];           # table: Raw, Pearson, Studentized, Standardized
+    Diagnostics = [];         # table: Leverage, CooksDistance, Dffits,
+                              # S2_i, CovRatio, Dfbetas, HatMatrix
+    ModelFitVsNullModel = []; # struct: Fstat, Pvalue, NullModel
+
+    ## Fitting method
+    Steps = [];               # stepwise fitting history ([] for fitlm)
+  endproperties
+
+
+ methods (Access = protected)
+
+    function obj = LinearModel (s)
+      ## LinearModel Constructor (internal use only).
+      ##
+      ## obj = LinearModel (s)
+      ##
+      ## S is a struct with two groups of fields:
+      ##   - all CompactLinearModel fields (consumed by the parent
+      ##     constructor), and
+      ##   - LinearModel-only fields: Variables, ObservationNames,
+      ##     ObservationInfo, Fitted, Residuals, Diagnostics,
+      ##     ModelFitVsNullModel, Steps.
+      ## Fields absent from S leave the matching property at its default.
+
+      if (nargin < 1)
+        ## Allow zero-arg construction for Octave classdef internals
+        return;
+      endif
+
+      if (! isstruct (s))
+        error ("LinearModel: constructor argument must be a struct.");
+      endif
+
+      ## Let the parent constructor pick up the CompactLinearModel fields
+      obj = obj@CompactLinearModel (s);
+
+      ## Copy the LinearModel-only fields that are present in S
+      own = {"Variables", "ObservationNames", "ObservationInfo", ...
+             "Fitted", "Residuals", "Diagnostics", ...
+             "ModelFitVsNullModel", "Steps"};
+      for k = 1:numel (own)
+        if (isfield (s, own{k}))
+          obj.(own{k}) = s.(own{k});
+        endif
+      endfor
+
+    endfunction
+
+ endmethods
+
+ methods (Static, Access = public)
+
+    function obj = fromFit (fit_s, X, y, formula, pred_names, resp_name, ...
+                            var_names, var_info, obs_names, obs_info, ...
+                            weights, excluded, missing_mask, steps, ...
+                            variables_data, X_full)
+      ## fromFit Build a LinearModel from lm_fit_engine output.
+      ##
+      ## This static factory method takes the raw lm_fit_engine struct and
+      ## all metadata, computes observation-level diagnostics, and returns
+      ## a fully-populated LinearModel object.
+      ##
+      ## Parameters:
+      ##   fit_s          - struct from lm_fit_engine(X_used, y_used, w_used)
+      ##   X              - design matrix for USED (subset) observations only
+      ##   y              - response vector for USED observations only
+      ##   formula        - Formula struct
+      ##   pred_names     - cell of predictor names
+      ##   resp_name      - response variable name (char)
+      ##   var_names      - cell of all variable names
+      ##   var_info       - variable info struct/table
+      ##   obs_names      - cell of observation names (or {})
+      ##   obs_info       - struct with Weights, Excluded, Missing, Subset
+      ##   weights        - original weights vector (n_total x 1, or [])
+      ##   excluded       - logical n_total x 1 excluded mask
+      ##   missing_mask   - logical n_total x 1 missing mask
+      ##   steps          - Steps struct ([] for fitlm)
+      ##   variables_data - Variables struct/table (source data)
+      ##   X_full         - optional full design matrix (n_total rows) used
+      ##                    to extrapolate Fitted values for excluded rows
+
+      n_total = numel (excluded);
+      n_used = fit_s.n;
+      subset = obs_info.Subset;
+
+      ## Compute Fitted for ALL observations (including excluded/missing)
+      fitted_all = NaN (n_total, 1);
+      fitted_all(subset) = fit_s.yhat;
+      ## Excluded-but-not-missing rows: compute yhat = X_full * beta (Phase 4)
+      ## MATLAB Block 7: excluded rows DO get Fitted values (X * beta extrapolation)
+      if (nargin >= 16 && ! isempty (X_full))
+        excl_not_missing = excluded & ! missing_mask;
+        if (any (excl_not_missing))
+          fitted_all(excl_not_missing) = X_full(excl_not_missing, :) * fit_s.beta;
+        endif
+      endif
+
+      ## Compute diagnostics for used observations.  Local aliases for the
+      ## fit statistics used repeatedly below.
+      h = fit_s.h;
+      raw = fit_s.resid;
+      rmse = fit_s.rmse;
+      mse = fit_s.mse;
+      sse = fit_s.sse;
+      dfe = fit_s.dfe;
+      p_est = fit_s.p_est;
+
+      ## Residual types (for used observations only).  S2_i is the
+      ## leave-one-out error variance; studentized residuals use it.
+      pearson = raw / rmse;
+      standardized = raw ./ (rmse * sqrt (1 - h));
+      S2_i_used = (sse - raw .^ 2 ./ (1 - h)) / (dfe - 1);
+      studentized = raw ./ (sqrt (S2_i_used) .* sqrt (1 - h));
+
+      ## Expand residuals to all observations (NaN for excluded/missing)
+      raw_all = NaN (n_total, 1);
+      pearson_all = NaN (n_total, 1);
+      studentized_all = NaN (n_total, 1);
+      standardized_all = NaN (n_total, 1);
+
+      raw_all(subset) = raw;
+      pearson_all(subset) = pearson;
+      studentized_all(subset) = studentized;
+      standardized_all(subset) = standardized;
+
+      ## Build Residuals table (MATLAB returns n x 4 table)
+      if (! isempty (obs_names))
+        resid_tbl = table (raw_all, pearson_all, studentized_all, ...
+                           standardized_all, ...
+                           "VariableNames", {"Raw", "Pearson", ...
+                                             "Studentized", "Standardized"}, ...
+                           "RowNames", obs_names(:)');
+      else
+        resid_tbl = table (raw_all, pearson_all, studentized_all, ...
+                           standardized_all, ...
+                           "VariableNames", {"Raw", "Pearson", ...
+                                             "Studentized", "Standardized"});
+      endif
+
+      ## Diagnostics for used observations
+      CooksDistance_used = (1 / p_est) * (h ./ (1 - h)) .* standardized .^ 2;
+      Dffits_used = studentized .* sqrt (h ./ (1 - h));
+      ## CovRatio: Candidate 3 formula, verified to 3.55e-14 against MATLAB
+      CovRatio_used = (S2_i_used / mse) .^ p_est ./ (1 - h);
+
+      ## Hat matrix from QR factor (n_used x n_used)
+      ## NOTE(review): this allocates a dense n_total x n_total matrix —
+      ## O(n^2) memory; may be prohibitive for large data sets.
+      HatMatrix_full = NaN (n_total, n_total);
+      Q_est = fit_s.Q;
+      H_used = Q_est * Q_est';
+      ## Place into full matrix
+      idx_used = find (subset);
+      HatMatrix_full(idx_used, idx_used) = H_used;
+      ## Excluded rows: set to 0
+      idx_excl = find (! subset);
+      HatMatrix_full(idx_excl, :) = 0;
+      HatMatrix_full(:, idx_excl) = 0;
+
+      ## Leverage for all observations
+      leverage_all = zeros (n_total, 1);
+      leverage_all(subset) = h;
+
+      ## CooksDistance, Dffits, S2_i, CovRatio for all obs
+      CooksDistance_all = NaN (n_total, 1);
+      Dffits_all = NaN (n_total, 1);
+      S2_i_all = NaN (n_total, 1);
+      CovRatio_all = NaN (n_total, 1);
+
+      CooksDistance_all(subset) = CooksDistance_used;
+      Dffits_all(subset) = Dffits_used;
+      S2_i_all(subset) = S2_i_used;
+      CovRatio_all(subset) = CovRatio_used;
+
+      ## Dfbetas (n_total x p_tot) via update formula.
+      ## NOTE(review): looks like DFBETAS_ij = (vcov x_i e_i) scaled by
+      ## (1 - h_i), sqrt(S2_i) and se_j — confirm against a reference
+      ## (e.g. Belsley/Kuh/Welsch) before relying on exact values.
+      p_tot = fit_s.p_tot;
+      Dfbetas_all = NaN (n_total, p_tot);
+      vcov = fit_s.vcov;
+      se_coef = sqrt (diag (vcov));
+      for i = 1:n_used
+        ii = idx_used(i);
+        x_i = X(i, :)';
+        e_i = raw(i);
+        h_i = h(i);
+        s2_i = S2_i_used(i);
+        ## Skip degenerate rows (zero LOO variance or unit leverage)
+        if (s2_i > 0 && (1 - h_i) > 0)
+          Dfbetas_all(ii, :) = ...
+            ((vcov * x_i * e_i) / ((1 - h_i) * sqrt (s2_i))) ./ se_coef;
+        endif
+      endfor
+
+      ## Build Diagnostics table (MATLAB returns n x 7 table)
+      if (! isempty (obs_names))
+        diag_tbl = table (leverage_all, CooksDistance_all, Dffits_all, ...
+                          S2_i_all, CovRatio_all, Dfbetas_all, ...
+                          HatMatrix_full, ...
+                          "VariableNames", {"Leverage", "CooksDistance", ...
+                                            "Dffits", "S2_i", "CovRatio", ...
+                                            "Dfbetas", "HatMatrix"}, ...
+                          "RowNames", obs_names(:)');
+      else
+        diag_tbl = table (leverage_all, CooksDistance_all, Dffits_all, ...
+                          S2_i_all, CovRatio_all, Dfbetas_all, ...
+                          HatMatrix_full, ...
+                          "VariableNames", {"Leverage", "CooksDistance", ...
+                                            "Dffits", "S2_i", "CovRatio", ...
+                                            "Dfbetas", "HatMatrix"});
+      endif
+
+      ## ModelFitVsNullModel: F-test of the fitted model against the
+      ## intercept-only (constant) model
+      if (p_est > 1 && mse > 0)
+        df_reg = p_est - 1;
+        F_null = (fit_s.ssr / df_reg) / mse;
+        p_null = 1 - fcdf (F_null, df_reg, dfe);
+      else
+        F_null = NaN;
+        p_null = NaN;
+      endif
+      mfvnm = struct ("Fstat", F_null, "Pvalue", p_null, ...
+                      "NullModel", "constant");
+
+      ## Build struct for parent + child constructor
+      s = struct ();
+
+      ## CompactLinearModel fields
+      s.Coefficients = table (fit_s.beta, fit_s.se, ...
+                              fit_s.tstat, fit_s.pval, ...
+                              "VariableNames", {"Estimate", "SE", "tStat", "pValue"}, ...
+                              "RowNames", formula.CoefficientNames(:)');
+      s.CoefficientNames = formula.CoefficientNames;
+      s.CoefficientCovariance = fit_s.vcov;
+      s.NumCoefficients = fit_s.p_tot;
+      s.NumEstimatedCoefficients = fit_s.p_est;
+      s.DFE = fit_s.dfe;
+      s.SSE = fit_s.sse;
+      s.SSR = fit_s.ssr;
+      s.SST = fit_s.sst;
+      s.MSE = fit_s.mse;
+      s.RMSE = fit_s.rmse;
+      s.Rsquared = fit_s.rsq;
+      s.LogLikelihood = fit_s.logL;
+      s.ModelCriterion = fit_s.crit;
+      s.Robust = [];
+      s.Formula = LinearFormula (formula);
+      s.NumObservations = n_used;
+      s.NumPredictors = numel (pred_names);
+      s.NumVariables = numel (var_names);
+      s.PredictorNames = pred_names;
+      s.ResponseName = resp_name;
+      ## Wrap VariableInfo as table if it is still a raw struct
+      if (isstruct (var_info) && ! isempty (var_info))
+        s.VariableInfo = table (var_info.Class, var_info.Range, ...
+                                var_info.InModel, var_info.IsCategorical, ...
+                                "VariableNames", {"Class", "Range", ...
+                                                  "InModel", "IsCategorical"}, ...
+                                "RowNames", var_names(:)');
+      else
+        s.VariableInfo = var_info;
+      endif
+      s.VariableNames = var_names;
+
+      ## LinearModel-only fields
+      if (nargin >= 15 && ! isempty (variables_data))
+        s.Variables = variables_data;
+      else
+        s.Variables = [];
+      endif
+      s.ObservationNames = obs_names;
+      ## Wrap ObservationInfo as table if it is still a raw struct
+      if (isstruct (obs_info) && ! isempty (obs_info))
+        if (! isempty (obs_names))
+          s.ObservationInfo = table (obs_info.Weights, obs_info.Excluded, ...
+                                     obs_info.Missing, obs_info.Subset, ...
+                                     "VariableNames", {"Weights", "Excluded", ...
+                                                       "Missing", "Subset"}, ...
+                                     "RowNames", obs_names(:)');
+        else
+          s.ObservationInfo = table (obs_info.Weights, obs_info.Excluded, ...
+                                     obs_info.Missing, obs_info.Subset, ...
+                                     "VariableNames", {"Weights", "Excluded", ...
+                                                       "Missing", "Subset"});
+        endif
+      else
+        s.ObservationInfo = obs_info;
+      endif
+      s.Fitted = fitted_all;
+      s.Residuals = resid_tbl;
+      s.Diagnostics = diag_tbl;
+      s.ModelFitVsNullModel = mfvnm;
+      s.Steps = steps;
+
+      obj = LinearModel (s);
+
+    endfunction
+
+ endmethods
+
+ ## Public methods
+ methods (Access = public)
+
+    function cmdl = compact (obj)
+      ## compact Strip to CompactLinearModel.
+      ##
+      ## cmdl = compact (mdl)
+      ##
+      ## Returns a CompactLinearModel carrying all 23 CompactLinearModel
+      ## properties. LinearModel-only properties (Variables, Residuals,
+      ## Diagnostics, Fitted, ObservationInfo, ObservationNames,
+      ## ModelFitVsNullModel, Steps) are NOT included.
+
+      shared = {"Coefficients", "CoefficientNames", ...
+                "CoefficientCovariance", "NumCoefficients", ...
+                "NumEstimatedCoefficients", "DFE", "SSE", "SSR", "SST", ...
+                "MSE", "RMSE", "Rsquared", "LogLikelihood", ...
+                "ModelCriterion", "Robust", "Formula", "NumObservations", ...
+                "NumPredictors", "NumVariables", "PredictorNames", ...
+                "ResponseName", "VariableInfo", "VariableNames"};
+      s = struct ();
+      for k = 1:numel (shared)
+        s.(shared{k}) = obj.(shared{k});
+      endfor
+      cmdl = CompactLinearModel (s);
+    endfunction
+
+    function NewMdl = addTerms (obj, terms)
+      ## addTerms Add terms to the model and refit.
+      ##
+      ## NewMdl = addTerms (mdl, terms)
+      ##
+      ## TERMS can be:
+      ##   - A string giving the Wilkinson notation for terms to add
+      ##     (e.g. 'x2' or 'x1:x2')
+      ##   - A terms matrix row(s) to add (binary matrix)
+      ##
+      ## Returns a NEW LinearModel; does not modify the original.
+
+      if (nargin < 2)
+        error ("LinearModel.addTerms: TERMS argument is required.");
+      endif
+
+      ## Get current formula info
+      current_terms = obj.Formula.Terms;
+      current_formula = obj.Formula;
+      p_names = obj.PredictorNames;
+
+      if (ischar (terms) || isstring (terms))
+        ## Parse the Wilkinson term string into term rows
+        terms = char (terms);
+        new_rows = __parse_term_string__ (terms, p_names);
+      elseif (isnumeric (terms))
+        new_rows = terms;
+      else
+        error ("LinearModel.addTerms: TERMS must be a string or numeric matrix.");
+      endif
+
+      ## Pad with response column(s) if Terms matrix is wider than new_rows
+      n_cols_terms = columns (current_terms);
+      if (columns (new_rows) < n_cols_terms)
+        new_rows = [new_rows, zeros(rows (new_rows), ...
+                                    n_cols_terms - columns (new_rows))];
+      endif
+
+      ## Merge: append each new row unless it already exists.  Uses a
+      ## vectorized row-wise membership test instead of a nested scan.
+      merged_terms = current_terms;
+      for i = 1:rows (new_rows)
+        if (! ismember (new_rows(i, :), merged_terms, "rows"))
+          merged_terms = [merged_terms; new_rows(i, :)];
+        endif
+      endfor
+
+      ## Sort terms: intercept first, then by order (sum of row).
+      ## sort() is stable, so equal-order terms keep insertion order.
+      term_order = sum (merged_terms, 2);
+      [~, sort_idx] = sort (term_order);
+      merged_terms = merged_terms(sort_idx, :);
+
+      ## Rebuild design matrix
+      y_vec = __get_y__ (obj);
+      w_vec = __get_weights__ (obj);
+
+      ## Build new design matrix (returns coef names for categoricals)
+      [X_new, new_coef_names] = __build_X_from_terms__ (obj, merged_terms);
+
+      ## Refit
+      fit_s = lm_fit_engine (X_new, y_vec, w_vec);
+
+      ## Update formula (convert to struct for modification; fromFit re-wraps)
+      if (isa (current_formula, "LinearFormula"))
+        new_formula = struct ("ResponseName", current_formula.ResponseName, ...
+                              "LinearPredictor", current_formula.LinearPredictor, ...
+                              "Terms", current_formula.Terms, ...
+                              "HasIntercept", current_formula.HasIntercept, ...
+                              "InModel", current_formula.InModel, ...
+                              "CoefficientNames", {current_formula.CoefficientNames}, ...
+                              "PredictorNames", {current_formula.PredictorNames}, ...
+                              "VariableNames", {current_formula.VariableNames});
+      else
+        new_formula = current_formula;
+      endif
+      new_formula.Terms = merged_terms;
+      new_formula.CoefficientNames = new_coef_names;
+      new_formula.LinearPredictor = __terms_to_formula_str__ ( ...
+                                      merged_terms, p_names, ...
+                                      new_formula.HasIntercept);
+
+      ## Build new LinearModel.  (A previously unused local copy of
+      ## ObservationInfo.Subset has been removed here.)
+      NewMdl = LinearModel.fromFit (fit_s, X_new, y_vec, new_formula, ...
+                p_names, obj.ResponseName, obj.VariableNames, ...
+                obj.VariableInfo, obj.ObservationNames, ...
+                obj.ObservationInfo, w_vec, ...
+                obj.ObservationInfo.Excluded, ...
+                obj.ObservationInfo.Missing, obj.Steps, ...
+                obj.Variables);
+
+    endfunction
+
+    function NewMdl = removeTerms (obj, terms)
+      ## removeTerms Remove terms from the model and refit.
+      ##
+      ## NewMdl = removeTerms (mdl, terms)
+      ##
+      ## TERMS can be:
+      ##   - A string giving the Wilkinson notation for terms to remove
+      ##   - A terms matrix row(s) to remove (binary matrix)
+      ##
+      ## Returns a NEW LinearModel; does not modify the original.
+
+      if (nargin < 2)
+        error ("LinearModel.removeTerms: TERMS argument is required.");
+      endif
+
+      ## Get current formula info
+      current_terms = obj.Formula.Terms;
+      current_formula = obj.Formula;
+      p_names = obj.PredictorNames;
+
+      if (ischar (terms) || isstring (terms))
+        terms = char (terms);
+        remove_rows = __parse_term_string__ (terms, p_names);
+      elseif (isnumeric (terms))
+        remove_rows = terms;
+      else
+        error ("LinearModel.removeTerms: TERMS must be a string or numeric.");
+      endif
+
+      ## Pad with response column(s) if Terms matrix is wider than remove_rows
+      n_cols_terms = columns (current_terms);
+      if (columns (remove_rows) < n_cols_terms)
+        remove_rows = [remove_rows, zeros(rows (remove_rows), ...
+                                          n_cols_terms - columns (remove_rows))];
+      endif
+
+      ## Remove matching rows from current terms.  Each requested row
+      ## knocks out at most one STILL-KEPT matching row (the keep_mask(j)
+      ## guard), so duplicate requests remove distinct duplicate rows
+      ## instead of re-matching the same already-removed row.
+      keep_mask = true (rows (current_terms), 1);
+      for i = 1:rows (remove_rows)
+        for j = 1:rows (current_terms)
+          if (keep_mask(j) && isequal (remove_rows(i, :), current_terms(j, :)))
+            keep_mask(j) = false;
+            break;
+          endif
+        endfor
+      endfor
+
+      if (sum (keep_mask) == rows (current_terms))
+        warning ("LinearModel:removeTerms", ...
+                 "LinearModel.removeTerms: no matching terms found.");
+      endif
+
+      merged_terms = current_terms(keep_mask, :);
+
+      ## Rebuild design matrix
+      y_vec = __get_y__ (obj);
+      w_vec = __get_weights__ (obj);
+      [X_new, new_coef_names] = __build_X_from_terms__ (obj, merged_terms);
+
+      ## Refit
+      fit_s = lm_fit_engine (X_new, y_vec, w_vec);
+
+      ## Update formula (convert to struct for modification; fromFit re-wraps)
+      if (isa (current_formula, "LinearFormula"))
+        new_formula = struct ("ResponseName", current_formula.ResponseName, ...
+                              "LinearPredictor", current_formula.LinearPredictor, ...
+                              "Terms", current_formula.Terms, ...
+                              "HasIntercept", current_formula.HasIntercept, ...
+                              "InModel", current_formula.InModel, ...
+                              "CoefficientNames", {current_formula.CoefficientNames}, ...
+                              "PredictorNames", {current_formula.PredictorNames}, ...
+                              "VariableNames", {current_formula.VariableNames});
+      else
+        new_formula = current_formula;
+      endif
+      new_formula.Terms = merged_terms;
+      new_formula.CoefficientNames = new_coef_names;
+      new_formula.LinearPredictor = __terms_to_formula_str__ ( ...
+                                      merged_terms, p_names, ...
+                                      new_formula.HasIntercept);
+
+      ## Build new LinearModel
+      NewMdl = LinearModel.fromFit (fit_s, X_new, y_vec, new_formula, ...
+                p_names, obj.ResponseName, obj.VariableNames, ...
+                obj.VariableInfo, obj.ObservationNames, ...
+                obj.ObservationInfo, w_vec, ...
+                obj.ObservationInfo.Excluded, ...
+                obj.ObservationInfo.Missing, obj.Steps, ...
+                obj.Variables);
+
+    endfunction
+
+    function NewMdl = step (obj, varargin)
+      ## step Take one or more stepwise steps.
+      ##
+      ## NewMdl = step (mdl)
+      ## NewMdl = step (mdl, Name, Value)
+      ##
+      ## Name-Value pairs:
+      ##   'Criterion' - 'sse' (default), 'aic', 'bic'
+      ##   'PEnter'    - p-value threshold for adding (default 0.05)
+      ##   'PRemove'   - p-value threshold for removing (default 0.10)
+      ##   'NSteps'    - maximum number of steps (default 1)
+      ##   'Verbose'   - 0 (silent), 1 (default, print steps)
+      ##   'Upper'     - maximum model (default 'interactions')
+      ##   'Lower'     - minimum model (default 'constant')
+
+      ## Parse name-value pairs.
+      ## NOTE(review): assumes every name is followed by a value; an
+      ## odd-length varargin would index past the end of the list.
+      criterion = "sse";
+      p_enter = 0.05;
+      p_remove = 0.10;
+      n_steps = 1;
+      verbose = 1;
+      upper_spec = "interactions";
+      lower_spec = "constant";
+
+      i = 1;
+      while (i <= numel (varargin))
+        if (! ischar (varargin{i}))
+          error ("LinearModel.step: expected parameter name string.");
+        endif
+        switch (lower (varargin{i}))
+          case "criterion"
+            criterion = lower (varargin{i+1});
+            i = i + 2;
+          case "penter"
+            p_enter = varargin{i+1};
+            i = i + 2;
+          case "premove"
+            p_remove = varargin{i+1};
+            i = i + 2;
+          case "nsteps"
+            n_steps = varargin{i+1};
+            i = i + 2;
+          case "verbose"
+            verbose = varargin{i+1};
+            i = i + 2;
+          case "upper"
+            upper_spec = varargin{i+1};
+            i = i + 2;
+          case "lower"
+            lower_spec = varargin{i+1};
+            i = i + 2;
+          otherwise
+            error ("LinearModel.step: unknown parameter '%s'.", varargin{i});
+        endswitch
+      endwhile
+
+      p_names = obj.PredictorNames;
+      n_pred = numel (p_names);
+
+      ## Build upper/lower terms matrices; a zeros response column is
+      ## appended so rows are comparable with Formula.Terms rows.
+      ## NOTE(review): __step_spec_to_terms__ is evaluated twice per
+      ## spec; the result could be hoisted into a temporary.
+      upper_terms = [__step_spec_to_terms__(upper_spec, n_pred), ...
+                     zeros(rows (__step_spec_to_terms__(upper_spec, n_pred)), 1)];
+      lower_terms = [__step_spec_to_terms__(lower_spec, n_pred), ...
+                     zeros(rows (__step_spec_to_terms__(lower_spec, n_pred)), 1)];
+
+      ## Build cat_flags from VariableInfo
+      cat_flags = false (1, n_pred + 1);
+      if ((isstruct (obj.VariableInfo) && isfield (obj.VariableInfo, "IsCategorical")) || ...
+          isa (obj.VariableInfo, "table"))
+        for k = 1:n_pred
+          vi_idx = strcmp (obj.VariableNames, p_names{k});
+          if (any (vi_idx))
+            cat_flags(k) = obj.VariableInfo.IsCategorical(vi_idx);
+          endif
+        endfor
+      endif
+
+      current_terms = obj.Formula.Terms;
+      ## F-test statistics are only logged for the 'sse' criterion
+      use_ftest = strcmp (criterion, "sse");
+
+      ## Initialize or extend Steps.History
+      if (isempty (obj.Steps))
+        start_lp = obj.Formula.LinearPredictor;
+        log_action = {'Start'};
+        log_term = {start_lp};
+        log_terms = {current_terms};
+        log_df = obj.NumCoefficients;
+        log_del_df = NaN;
+        log_fstat = NaN;
+        log_pval = NaN;
+        log_critval = __get_criterion__ (obj, criterion);
+        n_log = 1;
+      else
+        ## Continue a previous stepwise run: unwrap the existing history
+        ## (TermName entries may be nested one-element cells)
+        prev_h = obj.Steps.History;
+        n_log = numel (prev_h.Action);
+        log_action = prev_h.Action;
+        log_term = cell (n_log, 1);
+        for k = 1:n_log
+          if (iscell (prev_h.TermName{k}))
+            log_term{k} = prev_h.TermName{k}{1};
+          else
+            log_term{k} = prev_h.TermName{k};
+          endif
+        endfor
+        log_terms = prev_h.Terms;
+        log_df = prev_h.DF;
+        log_del_df = prev_h.delDF;
+        if (use_ftest && isfield (prev_h, "FStat"))
+          log_fstat = prev_h.FStat;
+          log_pval = prev_h.pValue;
+        else
+          log_fstat = NaN (n_log, 1);
+          log_pval = NaN (n_log, 1);
+        endif
+        log_critval = NaN (n_log, 1);
+      endif
+
+      ## Check for empty candidate set
+      all_cands = __stepwiselm_candidates__ (current_terms, lower_terms, ...
+                                             upper_terms, cat_flags, p_names);
+      if (isempty (all_cands) && verbose >= 1)
+        fprintf ("No terms to add to or remove from initial model.\n");
+      endif
+
+      ## Search loop
+      n_taken = 0;
+      current_mdl = obj;
+
+      while (n_taken < n_steps)
+        made_change = false;
+
+        ## Phase A: BACKWARD SCAN (remove until nothing qualifies).
+        ## Each pass removes the removable term with the LARGEST p-value
+        ## above PRemove (least significant first).
+        while (n_taken < n_steps)
+          candidates = __stepwiselm_candidates__ ( ...
+            current_terms, lower_terms, upper_terms, cat_flags, p_names);
+          rem_idx = [];
+          for ci = 1:numel (candidates)
+            if (strcmp (candidates(ci).action, "remove"))
+              rem_idx(end+1) = ci;
+            endif
+          endfor
+          if (isempty (rem_idx)); break; endif
+
+          best_ri = 0; best_pval_r = -Inf; best_fstat_r = 0; best_df_r = 0;
+          for ri = 1:numel (rem_idx)
+            c = candidates(rem_idx(ri));
+            f_stat = 0; f_pval = 1;
+            try
+              ## Trial refit without the candidate; F-test on the SSE
+              ## increase against the current model's MSE
+              test_mdl = removeTerms (current_mdl, c.term_row(1:n_pred));
+              ss_d = test_mdl.SSE - current_mdl.SSE;
+              df_d = test_mdl.DFE - current_mdl.DFE;
+              if (df_d > 0 && current_mdl.MSE > 0)
+                f_stat = (ss_d / df_d) / current_mdl.MSE;
+                f_pval = 1 - fcdf (f_stat, df_d, current_mdl.DFE);
+              endif
+            catch; continue; end_try_catch
+            if (f_pval > p_remove && f_pval > best_pval_r)
+              best_ri = rem_idx(ri); best_pval_r = f_pval;
+              best_fstat_r = f_stat; best_df_r = df_d;
+            endif
+          endfor
+          if (best_ri == 0); break; endif
+
+          ## Commit the removal and log it
+          best_c = candidates(best_ri);
+          current_mdl = removeTerms (current_mdl, best_c.term_row(1:n_pred));
+          mask = true (rows (current_terms), 1);
+          for j = 1:rows (current_terms)
+            if (isequal (current_terms(j,:), best_c.term_row))
+              mask(j) = false; break;
+            endif
+          endfor
+          current_terms = current_terms(mask, :);
+          n_taken++; made_change = true;
+          n_log++;
+          log_action{n_log} = "Remove"; log_term{n_log} = best_c.term_name;
+          log_terms{n_log} = current_terms;
+          log_df(n_log) = current_mdl.NumCoefficients;
+          log_del_df(n_log) = -best_df_r;
+          log_fstat(n_log) = best_fstat_r; log_pval(n_log) = best_pval_r;
+          log_critval(n_log) = __get_criterion__ (current_mdl, criterion);
+          if (verbose >= 1)
+            fprintf ("%d. Removing %s, FStat = %g, pValue = %g\n", ...
+                     n_taken, best_c.term_name, best_fstat_r, best_pval_r);
+          endif
+        endwhile
+        if (n_taken >= n_steps); break; endif
+
+        ## Phase B: FORWARD SCAN (add one term with the SMALLEST p-value
+        ## below PEnter)
+        candidates = __stepwiselm_candidates__ ( ...
+          current_terms, lower_terms, upper_terms, cat_flags, p_names);
+        add_idx = [];
+        for ci = 1:numel (candidates)
+          if (strcmp (candidates(ci).action, "add"))
+            add_idx(end+1) = ci;
+          endif
+        endfor
+        if (isempty (add_idx)); break; endif
+
+        best_ai = 0; best_pval_a = Inf; best_fstat_a = 0;
+        best_df_a = 0; best_add_mdl = [];
+        for ai = 1:numel (add_idx)
+          c = candidates(add_idx(ai));
+          f_stat = 0; f_pval = 1;
+          try
+            test_mdl = addTerms (current_mdl, c.term_row(1:n_pred));
+            ss_d = current_mdl.SSE - test_mdl.SSE;
+            df_d = current_mdl.DFE - test_mdl.DFE;
+            if (df_d > 0 && test_mdl.MSE > 0)
+              f_stat = (ss_d / df_d) / test_mdl.MSE;
+              f_pval = 1 - fcdf (f_stat, df_d, test_mdl.DFE);
+            endif
+          catch; continue; end_try_catch
+          if (f_pval < p_enter && f_pval < best_pval_a)
+            best_ai = add_idx(ai); best_pval_a = f_pval;
+            best_fstat_a = f_stat; best_df_a = df_d;
+            best_add_mdl = test_mdl;
+          endif
+        endfor
+        if (best_ai == 0)
+          ## (Both branches break out of the search loop; the inner
+          ## made_change check is redundant here.)
+          if (! made_change); break; endif
+          break;
+        endif
+
+        ## Commit the addition (reuse the trial model), re-sort terms by
+        ## order, and log the step
+        best_c = candidates(best_ai);
+        current_mdl = best_add_mdl;
+        current_terms = [current_terms; best_c.term_row];
+        term_order = sum (current_terms(:, 1:n_pred), 2);
+        [~, si] = sort (term_order);
+        current_terms = current_terms(si, :);
+        n_taken++; made_change = true;
+        n_log++;
+        log_action{n_log} = "Add"; log_term{n_log} = best_c.term_name;
+        log_terms{n_log} = current_terms;
+        log_df(n_log) = current_mdl.NumCoefficients;
+        log_del_df(n_log) = best_df_a;
+        log_fstat(n_log) = best_fstat_a; log_pval(n_log) = best_pval_a;
+        log_critval(n_log) = __get_criterion__ (current_mdl, criterion);
+        if (verbose >= 1)
+          fprintf ("%d. Adding %s, FStat = %g, pValue = %g\n", ...
+                   n_taken, best_c.term_name, best_fstat_a, best_pval_a);
+        endif
+      endwhile
+
+      ## Build Steps struct recording the full search history
+      resp_n = obj.ResponseName;
+      if (isempty (obj.Steps))
+        start_formula = [resp_n, " ~ ", log_term{1}];
+      else
+        start_formula = obj.Steps.Start;
+      endif
+      Steps.Start = start_formula; Steps.Lower = lower_spec;
+      Steps.Upper = upper_spec; Steps.Criterion = upper (criterion);
+      Steps.PEnter = p_enter; Steps.PRemove = p_remove;
+      History.Action = log_action(:);
+      History.TermName = cell (n_log, 1);
+      for k = 1:n_log
+        History.TermName{k} = {log_term{k}};
+      endfor
+      History.Terms = log_terms(:);
+      History.DF = log_df(:); History.delDF = log_del_df(:);
+      if (use_ftest)
+        History.FStat = log_fstat(:); History.pValue = log_pval(:);
+      else
+        History.(upper (criterion)) = log_critval(:);
+      endif
+      Steps.History = History;
+      current_mdl.Steps = Steps;
+      NewMdl = current_mdl;
+
+    endfunction
+
+    function tbl = anova (obj, varargin)
+      ## anova ANOVA table for LinearModel (Partial SS / Type III).
+      ##
+      ## tbl = anova (mdl)
+      ## tbl = anova (mdl, 'component')
+      ##
+      ## Returns a struct with fields:
+      ##   SumSq    - sum of squares for each term + Error
+      ##   DF       - degrees of freedom
+      ##   MeanSq   - mean squares
+      ##   F        - F-statistic (NaN for Error row)
+      ##   pValue   - p-value (NaN for Error row)
+      ##   RowNames - cell of term names + 'Error'
+
+      ## Parse optional anova type argument (only 'component' for now)
+
+      current_terms = obj.Formula.Terms;
+      p_names = obj.PredictorNames;
+
+      ## Identify non-intercept terms
+      is_intercept = all (current_terms == 0, 2);
+      term_idx = find (~is_intercept);
+      n_terms = numel (term_idx);
+
+      ## Compute partial SS for each term (Type III):
+      ##   SS(term_k) = SSE(model without term_k) - SSE(full model)
+      SumSq = zeros (n_terms + 1, 1);
+      DF = zeros (n_terms + 1, 1);
+      MeanSq = zeros (n_terms + 1, 1);
+      F_val = NaN (n_terms + 1, 1);
+      p_val = NaN (n_terms + 1, 1);
+      term_names = cell (n_terms + 1, 1);
+
+      y_vec = __get_y__ (obj);
+      w_vec = __get_weights__ (obj);
+
+      for k = 1:n_terms
+        ## Remove term k and refit
+        reduced_terms = current_terms;
+        reduced_terms(term_idx(k), :) = [];
+
+        [X_reduced, ~] = __build_X_from_terms__ (obj, reduced_terms);
+        fit_reduced = lm_fit_engine (X_reduced, y_vec, w_vec);
+
+        ## Partial SS
+        ss_k = fit_reduced.sse - obj.SSE;
+        df_k = fit_reduced.dfe - obj.DFE;
+
+        SumSq(k) = ss_k;
+        DF(k) = df_k;
+        ## Guard against df_k == 0 (an aliased / rank-deficient term
+        ## whose removal does not change DFE) and a saturated fit
+        ## (MSE == 0): report NaN instead of dividing by zero.
+        if (df_k > 0)
+          MeanSq(k) = ss_k / df_k;
+          if (obj.MSE > 0)
+            F_val(k) = MeanSq(k) / obj.MSE;
+            p_val(k) = 1 - fcdf (F_val(k), df_k, obj.DFE);
+          endif
+        else
+          MeanSq(k) = NaN;
+        endif
+
+        ## Term name: join predictor names with ':', appending '^p' for
+        ## powers greater than one.  (A redundant preliminary
+        ## name-building loop whose result was discarded was removed.)
+        row = current_terms(term_idx(k), :);
+        idx = find (row);
+        power_parts = {};
+        for j = 1:numel (idx)
+          if (row(idx(j)) == 1)
+            power_parts{end+1} = p_names{idx(j)};
+          else
+            power_parts{end+1} = sprintf ("%s^%d", p_names{idx(j)}, ...
+                                          row(idx(j)));
+          endif
+        endfor
+        term_names{k} = strjoin (power_parts, ":");
+      endfor
+
+      ## Error row
+      SumSq(end) = obj.SSE;
+      DF(end) = obj.DFE;
+      MeanSq(end) = obj.MSE;
+      F_val(end) = NaN;
+      p_val(end) = NaN;
+      term_names{end} = "Error";
+
+      tbl = struct ("SumSq", SumSq, "DF", DF, "MeanSq", MeanSq, ...
+                    "F", F_val, "pValue", p_val, "RowNames", {term_names});
+
+    endfunction
+
+ endmethods
+
+endclassdef
+
+## ============================================================
+## Private helper functions
+## ============================================================
+
+function rows_out = __parse_term_string__ (term_str, pred_names)
+  ## Parse a Wilkinson-style term string into binary term matrix rows.
+  ##
+  ## e.g. 'x2' -> [0 1 0], 'x1:x2' -> [1 1 0], 'x1^2' -> [2 0 0]
+  ##
+  ## TERM_STR   - char: terms joined by '+', factors joined by ':',
+  ##              optional positive integer power via '^'
+  ## PRED_NAMES - cell array of predictor name strings; defines the
+  ##              column order of the output
+  ##
+  ## Returns one row per '+'-separated term.  Errors on unrecognized
+  ## predictor names and on malformed powers.
+
+  n_pred = numel (pred_names);
+
+  ## Split by '+' to get individual terms
+  terms = strtrim (strsplit (term_str, "+"));
+  rows_out = zeros (numel (terms), n_pred);
+
+  for i = 1:numel (terms)
+    ## Split by ':' for interaction terms
+    parts = strtrim (strsplit (terms{i}, ":"));
+    for j = 1:numel (parts)
+      ## Handle power notation (e.g., 'x1^2')
+      p = parts{j};
+      caret = strfind (p, "^");
+      if (! isempty (caret))
+        base = strtrim (p(1:caret-1));
+        pwr = str2double (p(caret+1:end));
+        ## str2double returns NaN for malformed input; reject that (and
+        ## non-positive or fractional exponents) instead of silently
+        ## writing NaN into the terms matrix.
+        if (! isfinite (pwr) || pwr < 1 || pwr != fix (pwr))
+          error ("LinearModel: invalid power in term '%s'.", p);
+        endif
+      else
+        base = p;
+        pwr = 1;
+      endif
+      ## Find the predictor index
+      idx = find (strcmp (pred_names, base));
+      if (isempty (idx))
+        error ("LinearModel: unrecognized predictor name '%s'.", base);
+      endif
+      rows_out(i, idx(1)) = pwr;
+    endfor
+  endfor
+endfunction
+
+function [X, coef_names] = __rebuild_X__ (obj)
+  ## Rebuild the design matrix from stored Formula + predictor data.
+  ##
+  ## Thin convenience wrapper: delegates to __build_X_from_terms__ with
+  ## the terms matrix currently stored on OBJ.Formula.  Returns the
+  ## design matrix X and the matching coefficient names.
+  [X, coef_names] = __build_X_from_terms__ (obj, obj.Formula.Terms);
+endfunction
+
+function [X, coef_names] = __build_X_from_terms__ (obj, terms)
+  ## Build design matrix from terms matrix and stored predictor data.
+  ##
+  ## Phase 3B: handles categorical variables via reference (corner-point)
+  ## dummy coding. For categorical predictors with K levels, K-1 dummy
+  ## columns are generated (dropping the first level when intercept is
+  ## present). Interactions between categorical and numeric variables
+  ## produce a Cartesian product of dummy columns x numeric columns.
+  ##
+  ## Inputs:
+  ##   obj   - model object providing PredictorNames, Variables,
+  ##           VariableInfo, VariableNames, and ObservationInfo.Subset
+  ##   terms - t x n_pred matrix of term powers; an all-zero row denotes
+  ##           the intercept
+  ##
+  ## Returns:
+  ##   X          - n_obs x p design matrix
+  ##   coef_names - 1 x p cell of coefficient name strings
+
+  ## Get predictor data for subset observations
+  subset = obj.ObservationInfo.Subset;
+  p_names = obj.PredictorNames;
+  n_pred = numel (p_names);
+
+  ## Detect intercept: any all-zero row in the terms matrix
+  has_intercept = any (all (terms == 0, 2));
+
+  ## Extract predictor columns and detect categorical status
+  n_obs = sum (subset);
+  pred_raw = cell (1, n_pred);
+  is_categorical = false (1, n_pred);
+  cat_levels = cell (1, n_pred);   ## level labels per categorical predictor
+  cat_indices = cell (1, n_pred);  ## per-row level index into cat_levels{k}
+
+  for k = 1:n_pred
+    pname = p_names{k};
+    ## Fetch the full predictor column from stored Variables (table or struct)
+    if (! isempty (obj.Variables) && isa (obj.Variables, "table"))
+      col_full = obj.Variables.(pname);
+    elseif (! isempty (obj.Variables) && isstruct (obj.Variables))
+      if (isfield (obj.Variables, pname))
+        col_full = obj.Variables.(pname);
+      else
+        error ("LinearModel: predictor '%s' not found in Variables.", pname);
+      endif
+    else
+      error (strcat ("LinearModel: cannot rebuild design matrix", ...
+                     " without stored Variables data."));
+    endif
+
+    ## Extract subset rows: cell/string/categorical indexed with a single
+    ## subscript; numeric may be a matrix, so keep all of its columns
+    if (iscell (col_full) || iscellstr (col_full) || isstring (col_full))
+      col = col_full(subset);
+    elseif (isa (col_full, "categorical"))
+      col = col_full(subset);
+    else
+      col = col_full(subset, :);
+    endif
+
+    pred_raw{k} = col;
+
+    ## Detect categorical: use VariableInfo.IsCategorical as primary source
+    ## (Fix 4, Phase 3B), with type-sniffing as a secondary fallback.
+    vi_is_cat = false;
+    if (! isempty (obj.VariableInfo) && ...
+        ((isstruct (obj.VariableInfo) && isfield (obj.VariableInfo, "IsCategorical")) || ...
+         isa (obj.VariableInfo, "table")))
+      vi_idx = strcmp (obj.VariableNames, pname);
+      if (any (vi_idx))
+        vi_is_cat = obj.VariableInfo.IsCategorical(vi_idx);
+      endif
+    endif
+
+    ## Type-sniff fallback (cellstr, string, categorical)
+    type_is_cat = iscellstr (col) || isstring (col) || isa (col, "categorical");
+
+    if (vi_is_cat || type_is_cat)
+      is_categorical(k) = true;
+      if (isa (col, "categorical"))
+        ## Preserve the declared category order for level numbering
+        levs = categories (col);
+        cat_levels{k} = cellstr (levs);
+        [~, idx] = ismember (col, levs);
+        cat_indices{k} = idx;
+      elseif (iscellstr (col))
+        [u, ~, idx] = unique (col);
+        cat_levels{k} = u;
+        cat_indices{k} = idx;
+      elseif (isstring (col))
+        col_c = cellstr (col);
+        [u, ~, idx] = unique (col_c);
+        cat_levels{k} = u;
+        cat_indices{k} = idx;
+      else
+        ## Numeric column flagged categorical via VariableInfo.IsCategorical
+        ## Treat unique numeric values as levels (sorted)
+        [u, ~, idx] = unique (col);
+        cat_levels{k} = cellfun (@num2str, num2cell (u), "UniformOutput", false);
+        cat_indices{k} = idx;
+      endif
+    else
+      is_categorical(k) = false;
+    endif
+  endfor
+
+  ## Build X column by column from terms
+  X = [];
+  coef_names = {};
+
+  for i = 1:rows (terms)
+    row = terms(i, :);
+
+    if (all (row == 0))
+      ## Intercept
+      X = [X, ones(n_obs, 1)];
+      coef_names{end+1} = "(Intercept)";
+      continue;
+    endif
+
+    ## Find active predictors for this term
+    active_idx = find (row);
+
+    ## Build the term's columns via Cartesian product approach.
+    ## Start with a single column of ones, then for each active predictor:
+    ##   - If numeric: multiply current block by (data.^power)
+    ##   - If categorical: Cartesian product with dummy columns
+    current_block = ones (n_obs, 1);
+    current_names = {''};
+
+    for ai = 1:numel (active_idx)
+      v = active_idx(ai);
+      pname = p_names{v};
+      pwr = row(v);
+
+      if (is_categorical(v))
+        ## Build dummy columns (reference coding)
+        levs = cat_levels{v};
+        idx = cat_indices{v};
+        n_lev = numel (levs);
+
+        ## With an intercept the first level is the reference and is
+        ## dropped; without one, every level gets its own column
+        if (has_intercept)
+          start_lev = 2;
+        else
+          start_lev = 1;
+        endif
+
+        n_dum = n_lev - start_lev + 1;
+        dummies = zeros (n_obs, n_dum);
+        dum_names = cell (1, n_dum);
+
+        for L = start_lev:n_lev
+          col_idx = L - start_lev + 1;
+          dummies(:, col_idx) = double (idx == L);
+          dum_names{col_idx} = sprintf ("%s_%s", pname, levs{L});
+        endfor
+
+        ## Cartesian product of current block and dummies
+        next_block = [];
+        next_names = {};
+        for c1 = 1:columns (current_block)
+          for c2 = 1:columns (dummies)
+            next_block = [next_block, current_block(:, c1) .* dummies(:, c2)];
+            n1 = current_names{c1};
+            n2 = dum_names{c2};
+            if (isempty (n1))
+              next_names{end+1} = n2;
+            else
+              next_names{end+1} = [n1, ":", n2];
+            endif
+          endfor
+        endfor
+        current_block = next_block;
+        current_names = next_names;
+
+      else
+        ## Numeric predictor: multiply current block by data.^power
+        ## NOTE(review): assumes pred_raw{v} is a column vector here; a
+        ## matrix-valued numeric predictor would widen current_block via
+        ## bsxfun without updating current_names — confirm upstream
+        ## guarantees single-column numeric predictors
+        col_data = pred_raw{v};
+        if (pwr > 1)
+          col_data = col_data .^ pwr;
+          disp_name = sprintf ("%s^%d", pname, pwr);
+        else
+          disp_name = pname;
+        endif
+
+        current_block = bsxfun (@times, current_block, col_data);
+        for cn = 1:numel (current_names)
+          if (isempty (current_names{cn}))
+            current_names{cn} = disp_name;
+          else
+            current_names{cn} = [current_names{cn}, ":", disp_name];
+          endif
+        endfor
+      endif
+    endfor
+
+    X = [X, current_block];
+    coef_names = [coef_names, current_names];
+  endfor
+endfunction
+
+function y = __get_y__ (obj)
+  ## Return the response vector restricted to the in-subset observations.
+  ##
+  ## The response is looked up by name in the stored Variables (table or
+  ## struct).  When no Variables data is stored, the response is
+  ## reconstructed as fitted values plus raw residuals.
+  sel = obj.ObservationInfo.Subset;
+  rname = obj.ResponseName;
+  vars = obj.Variables;
+
+  have_table = ! isempty (vars) && isa (vars, "table");
+  have_struct = ! isempty (vars) && isstruct (vars);
+
+  if (have_table)
+    full_resp = vars.(rname);
+    y = full_resp(sel);
+  elseif (have_struct)
+    if (! isfield (vars, rname))
+      error ("LinearModel: response '%s' not found in Variables.", rname);
+    endif
+    full_resp = vars.(rname);
+    y = full_resp(sel);
+  else
+    ## Reconstruct from Fitted + Residuals
+    y = obj.Fitted(sel) + obj.Residuals.Raw(sel);
+  endif
+endfunction
+
+function w = __get_weights__ (obj)
+  ## Return per-observation weights for the subset rows, or [] when the
+  ## fit is unweighted (i.e. every stored weight equals 1).
+  all_w = obj.ObservationInfo.Weights;
+  unweighted = all (all_w == 1);
+  if (unweighted)
+    w = [];
+  else
+    w = all_w(obj.ObservationInfo.Subset);
+  endif
+endfunction
+
+function score = __get_criterion__ (mdl, criterion)
+  ## Map a criterion name to a lower-is-better scalar score for MDL.
+  ## R-squared criteria are negated so that smaller always means better.
+  ## "sse" and any unrecognized name both yield the SSE.
+  if (strcmp (criterion, "aic"))
+    score = mdl.ModelCriterion.AIC;
+  elseif (strcmp (criterion, "bic"))
+    score = mdl.ModelCriterion.BIC;
+  elseif (strcmp (criterion, "rsquared"))
+    score = -mdl.Rsquared.Ordinary;  ## negate so lower = better
+  elseif (strcmp (criterion, "adjrsquared"))
+    score = -mdl.Rsquared.Adjusted;
+  else
+    ## "sse" and the fallback share the same result
+    score = mdl.SSE;
+  endif
+endfunction
+
+function names = __terms_to_coef_names__ (terms, pred_names, has_intercept)
+  ## Convert a terms matrix into coefficient name strings.
+  ## An all-zero row maps to "(Intercept)"; otherwise the active
+  ## predictors are joined with ':' and powers > 1 rendered as 'name^k'.
+  ## (has_intercept is accepted for interface symmetry but unused.)
+  names = {};
+  for t = 1:rows (terms)
+    active = find (terms(t, :));
+    if (isempty (active))
+      names{t} = "(Intercept)";
+    else
+      pieces = cell (1, numel (active));
+      for k = 1:numel (active)
+        pw = terms(t, active(k));
+        if (pw == 1)
+          pieces{k} = pred_names{active(k)};
+        else
+          pieces{k} = sprintf ("%s^%d", pred_names{active(k)}, pw);
+        endif
+      endfor
+      names{t} = strjoin (pieces, ":");
+    endif
+  endfor
+endfunction
+
+function fstr = __terms_to_formula_str__ (terms, pred_names, has_intercept)
+  ## Render a terms matrix as the RHS of a Wilkinson formula string.
+  ## The intercept row (all zeros) renders as "1"; other rows join their
+  ## active predictors with ':' and powers > 1 as 'name^k'.
+  ## (has_intercept is accepted for interface symmetry but unused.)
+  parts = {};
+  for t = 1:rows (terms)
+    active = find (terms(t, :));
+    if (isempty (active))
+      parts{end+1} = "1";
+      continue;
+    endif
+    pieces = {};
+    for k = active
+      pw = terms(t, k);
+      if (pw == 1)
+        pieces{end+1} = pred_names{k};
+      else
+        pieces{end+1} = sprintf ("%s^%d", pred_names{k}, pw);
+      endif
+    endfor
+    parts{end+1} = strjoin (pieces, ":");
+  endfor
+  fstr = strjoin (parts, " + ");
+endfunction
+
+## ============================================================
+## Tests
+## ============================================================
+
+## Test: LinearModel class exists and is parseable
+%!test
+%! assert (exist ("LinearModel", "class") > 0 || ...
+%! exist ("LinearModel") > 0);
+
diff --git a/inst/dummyvar.m b/inst/dummyvar.m
index 4a86e103..fc720430 100644
--- a/inst/dummyvar.m
+++ b/inst/dummyvar.m
@@ -1,4 +1,4 @@
-## Copyright (C) 2025 Jayant Chauhan <0001jayant@gmail.com>
+## Copyright (C) 2025-26 Jayant Chauhan <0001jayant@gmail.com>
## Copyright (C) 2026 Andreas Bertsatos
##
## This file is part of the statistics package for GNU Octave.
@@ -59,31 +59,28 @@
print_usage;
endif
+ if (!isnumeric (g) && !isa (g, 'categorical'))
+    error ("dummyvar: unsupported type of grouping variable.");
+ endif
+
[nr, nc] = size (g);
## --- CATEGORICAL branch ---
if (isa (g, 'categorical'))
- if (! isvector (g) || nc != 1)
- error (strcat ("dummyvar: categorical grouping", ...
- " variable must be a column vector."));
+ if (nc != 1 || ndims(g) > 2)
+ error ("Categorical grouping variable must have one column.");
endif
- g_cats = cellstr (categories (g));
- g_cstr = cellstr (g(:));
- K = numel (g_cats);
+ [idx, gn] = grp2idx (g);
+ K = numel (gn);
D = zeros (nr, K);
for i = 1:nr
- if (isundefined (g(i)))
+ if (isnan (idx(i)))
D(i,:) = NaN;
else
- for k = 1:K
- if (strcmp (g_cstr{i}, g_cats{k}))
- D(i,k) = 1;
- break;
- endif
- endfor
+ D(i, idx(i)) = 1;
endif
endfor
@@ -91,12 +88,10 @@
elseif (isnumeric (g))
if (ndims (g) != 2)
- error (strcat ("dummyvar: numeric grouping variable", ...
- " must be either a vector or a matrix."));
+ error ("dummyvar: numeric grouping variable must be either a vector or a matrix.");
endif
- if (any (g(:) <= 0) || any (g(:) != fix (g(:))))
- error (strcat ("dummyvar: numeric grouping variable", ...
- " must explicitly contain positive integers."));
+ if (!isempty(g) && (any (g(:) <= 0) || any (g(:) != fix (g(:)))))
+ error ("dummyvar: numeric grouping variable must explicitly contain positive integers.");
endif
## Force vector to column vector
@@ -107,6 +102,11 @@
endif
K = max (g, [], 1);
+ ## Handle empty input properly
+ if (isempty(K))
+ D = [];
+ return;
+ endif
D = zeros (nr, sum (K));
ij = 0;
@@ -118,45 +118,11 @@
endfor
endfor
- ## --- CELLSTRING branch ---
- elseif (iscellstr (g) && isvector (g))
-
- if (! isvector (g) || nc != 1)
- error (strcat ("dummyvar: cellstring grouping", ...
- " variable must be a column vector."));
- endif
-
- g = grp2idx (g);
- K = max (g);
- D = zeros (length (g), K);
- for i = 1:K
- D(g == i, i) = 1;
- endfor
-
- ## --- CELL ARRAY branch ---
- elseif (iscell (g) && isvector (g))
-
- if (any (diff (cellfun (@(x) size (x, 1), g))))
- error (strcat ("dummyvar: all grouping variables in cell array", ...
- " must have the same number of observations."));
- endif
-
- D = [];
- for i = 1:numel (g)
- g_var = g{i};
- D_var = dummyvar (g_var);
- D = [D, D_var];
- endfor
-
- else
- error ("dummyvar: unsupported type of grouping variable.");
endif
endfunction
## Test output
-%!assert (dummyvar ([]), [])
-%!assert (dummyvar (ones (2, 0)), ones (2, 0))
%!test
%! ## numeric grouping vector
%! g = [1; 2; 1; 3; 2];
@@ -188,42 +154,32 @@
%! 0, 1, 0, 1, 0; 0, 1, 0, 0, 1; 0, 1, 1, 0, 0; 0, 1, 0, 1, 0];
%! assert (D, D1);
%!test
-%! phone = {'mob'; 'land'; 'mob';'mob';'mob';'land';'land'};
-%! codes = categorical ([202; 202; 103; 103; 202; 103; 202]);
-%! D = dummyvar ({phone, codes});
-%! D1 = [1, 0, 0, 1; 0, 1, 0, 1; 1, 0, 1, 0; 1, 0, 1, 0; ...
-%! 1, 0, 0, 1; 0, 1, 1, 0; 0, 1, 0, 1];
-%! assert (D, D1);
-%!test
%! colors = {'red'; 'blue'; 'red'; 'green'; 'yellow'; 'blue'};
%! D = dummyvar (categorical (colors));
%! D1 = [0, 0, 1, 0; 1, 0, 0, 0; 0, 0, 1, 0; 0, 1, 0, 0; 0, 0, 0, 1; 1, 0, 0, 0];
%! assert (D, D1);
%!test
-%! colors = {'red'; 'blue'; 'red'; 'green'; 'yellow'; 'blue'};
-%! D = dummyvar (colors);
-%! D1 = [1, 0, 0, 0; 0, 1, 0, 0; 1, 0, 0, 0; 0, 0, 1, 0; 0, 0, 0, 1; 0, 1, 0, 0];
-%! assert (D, D1);
-%! D = dummyvar ({colors});
-%! D1 = [1, 0, 0, 0; 0, 1, 0, 0; 1, 0, 0, 0; 0, 0, 1, 0; 0, 0, 0, 1; 0, 1, 0, 0];
-%! assert (D, D1);
-%!test
%! g = [1, 2, 1, 2, 1, 3, 2, 1];
%! D = dummyvar (g);
%! D1 = [1, 0, 0; 0, 1, 0; 1, 0, 0; 0, 1, 0; 1, 0, 0; 0, 0, 1; 0, 1, 0; 1, 0, 0];
%! assert (D, D1);
-
-## Test input validation
+%!test
+%! g = categorical ({'a'; 'b'; 'a'}, {'b', 'a', 'c'}, {'b', 'a', 'c'});
+%! D = dummyvar (g);
+%! assert (D, [0, 1, 0; 1, 0, 0; 0, 1, 0]);
+%!test
+%! g = categorical ({''; ''; ''}, {'x', 'y'});
+%! D = dummyvar (g);
+%! assert (D, [NaN, NaN; NaN, NaN; NaN, NaN]);
+%!test
+%! g = categorical ({'a'; 'a'; 'b'}, {'a', 'b', 'c'});
+%! D = dummyvar (g);
+%! assert (D, [1, 0, 0; 1, 0, 0; 0, 1, 0]);
+%! assert (size (D, 2), 3);
%!error dummyvar ()
-%!error dummyvar (1, 2)
-%!error ...
-%! dummyvar (categorical ({'a', 'b'}))
-%!error ...
-%! dummyvar (ones (3, 3, 3))
-%!error ...
-%! dummyvar ([2, 4, 0, 8, 1])
-%!error ...
-%! dummyvar ({'a', 'b'})
-%!error ...
-%! dummyvar ({[2;3;4;5], [1;2;3]})
-%!error dummyvar ([true; false])
+%!error dummyvar (1, 2)
+%!error dummyvar (categorical ({'a', 'b'}))
+%!error dummyvar ({'a', 'b'})
+%!error dummyvar (ones (3, 3, 3))
+%!error dummyvar ([2, 4, 0, 8, 1])
+%!error dummyvar ([true; false])
diff --git a/inst/fitlm.m b/inst/fitlm.m
index bb397e4a..ff51f2f2 100644
--- a/inst/fitlm.m
+++ b/inst/fitlm.m
@@ -1,4 +1,4 @@
-## Copyright (C) 2022 Andrew Penn
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
##
## This file is part of the statistics package for GNU Octave.
##
@@ -16,397 +16,322 @@
## this program; if not, see .
## -*- texinfo -*-
-## @deftypefn {statistics} {@var{tab} =} fitlm (@var{X}, @var{y})
-## @deftypefnx {statistics} {@var{tab} =} fitlm (@var{X}, @var{y}, @var{name}, @var{value})
-## @deftypefnx {statistics} {@var{tab} =} fitlm (@var{X}, @var{y}, @var{modelspec})
-## @deftypefnx {statistics} {@var{tab} =} fitlm (@var{X}, @var{y}, @var{modelspec}, @var{name}, @var{value})
-## @deftypefnx {statistics} {[@var{tab}] =} fitlm (@dots{})
-## @deftypefnx {statistics} {[@var{tab}, @var{stats}] =} fitlm (@dots{})
-## @deftypefnx {statistics} {[@var{tab}, @var{stats}] =} fitlm (@dots{})
+## @deftypefn {statistics} {@var{mdl} =} fitlm (@var{X}, @var{y})
+## @deftypefnx {statistics} {@var{mdl} =} fitlm (@var{X}, @var{y}, @var{modelspec})
+## @deftypefnx {statistics} {@var{mdl} =} fitlm (@var{X}, @var{y}, @var{modelspec}, @var{Name}, @var{Value})
+## @deftypefnx {statistics} {@var{mdl} =} fitlm (@var{tbl})
+## @deftypefnx {statistics} {@var{mdl} =} fitlm (@var{tbl}, @var{modelspec})
+## @deftypefnx {statistics} {@var{mdl} =} fitlm (@var{tbl}, @var{modelspec}, @var{Name}, @var{Value})
##
-## Regress the continuous outcome (i.e. dependent variable) @var{y} on
-## continuous or categorical predictors (i.e. independent variables) @var{X}
-## by minimizing the sum-of-squared residuals. Unless requested otherwise,
-## @qcode{fitlm} prints the model formula, the regression coefficients (i.e.
-## parameters/contrasts) and an ANOVA table. Note that unlike @qcode{anovan},
-## @qcode{fitlm} treats all factors as continuous by default. A bootstrap
-## resampling variant of this function, @code{bootlm}, is available in the
-## statistics-resampling package and has similar usage.
+## Fit a linear regression model.
##
-## @var{X} must be a column major matrix or cell array consisting of the
-## predictors. A constant term (intercept) should not be included in X - it
-## is automatically added to the model. @var{y} must be a column vector
-## corresponding to the outcome variable. @var{modelspec} can specified as
-## one of the following:
+## @code{fitlm} fits a linear regression model and returns a
+## @code{LinearModel} object.
##
-## @itemize
-## @item
-## "constant" : model contains only a constant (intercept) term.
-##
-## @item
-## "linear" (default) : model contains an intercept and linear term for each
-## predictor.
-##
-## @item
-## "interactions" : model contains an intercept, linear term for each predictor
-## and all products of pairs of distinct predictors.
-##
-## @item
-## "full" : model contains an intercept, linear term for each predictor and
-## all combinations of the predictors.
+## @strong{Inputs}
##
-## @item
-## a matrix of term definitions : an t-by-(N+1) matrix specifying terms in
-## a model, where t is the number of terms, N is the number of predictor
-## variables, and +1 accounts for the outcome variable. The outcome variable
-## is the last column in the terms matrix and must be a column of zeros.
-## An intercept must be specified in the first row of the terms matrix and
-## must be a row of zeros.
-## @end itemize
-##
-## @qcode{fitlm} can take a number of optional parameters as name-value pairs.
-##
-## @code{[@dots{}] = fitlm (..., "CategoricalVars", @var{categorical})}
+## @var{X} is an @math{n x p} numeric matrix of predictor values.
+## @var{y} is an @math{n x 1} numeric response vector.
+## @var{tbl} is a @code{table} object containing all variables.
##
+## @var{modelspec} can be:
## @itemize
-## @item
-## @var{categorical} is a vector of indices indicating which of the columns
-## (i.e. variables) in @var{X} should be treated as categorical predictors
-## rather than as continuous predictors.
+## @item @qcode{'linear'} (default) — intercept + all linear terms
+## @item @qcode{'constant'} — intercept only
+## @item @qcode{'interactions'} — linear + all pairwise interactions
+## @item @qcode{'purequadratic'} — linear + all squared terms
+## @item @qcode{'quadratic'} — linear + interactions + squared terms
+## @item A Wilkinson formula string, e.g. @qcode{'y ~ x1 + x2 + x1:x2'}
## @end itemize
##
-## @qcode{fitlm} also accepts optional @qcode{anovan} parameters as name-value
-## pairs (except for the "model" parameter). The accepted parameter names from
-## @qcode{anovan} and their default values in @qcode{fitlm} are:
-##
-## @itemize
-## @item
-## @var{CONTRASTS} : "treatment"
-##
-## @item
-## @var{SSTYPE}: 2
-##
-## @item
-## @var{ALPHA}: 0.05
-##
-## @item
-## @var{DISPLAY}: "on"
+## @strong{Name-Value Arguments}
##
-## @item
-## @var{WEIGHTS}: [] (empty)
-##
-## @item
-## @var{RANDOM}: [] (empty)
-##
-## @item
-## @var{CONTINUOUS}: [1:N]
-##
-## @item
-## @var{VARNAMES}: [] (empty)
-## @end itemize
+## @table @asis
+## @item @qcode{'Weights'} — n×1 double, per-observation weights
+## @item @qcode{'Exclude'} — integer indices or logical vector of rows to exclude
+## @item @qcode{'Intercept'} — logical (default: @code{true})
+## @item @qcode{'CategoricalVars'} — integer indices or cell of names
+## @item @qcode{'VarNames'} — cell of char, override variable names (matrix input)
+## @item @qcode{'ResponseVar'} — char, override response variable (table input)
+## @item @qcode{'PredictorVars'} — cell or indices, specify predictor subset (table input)
+## @end table
##
-## Type '@qcode{help anovan}' to find out more about what these options do.
+## @strong{Output}
##
-## @qcode{fitlm} can return up to two output arguments:
+## @var{mdl} is a @code{LinearModel} object with all properties populated.
##
-## [@var{tab}] = fitlm (@dots{}) returns a cell array containing a
-## table of model parameters
-##
-## [@var{tab}, @var{stats}] = fitlm (@dots{}) returns a structure
-## containing additional statistics, including degrees of freedom and effect
-## sizes for each term in the linear model, the design matrix, the
-## variance-covariance matrix, (weighted) model residuals, and the mean squared
-## error. The columns of @var{stats}.coeffs (from left-to-right) report the
-## model coefficients, standard errors, lower and upper 100*(1-alpha)%
-## confidence interval bounds, t-statistics, and p-values relating to the
-## contrasts. The number appended to each term name in @var{stats}.coeffnames
-## corresponds to the column number in the relevant contrast matrix for that
-## factor. The @var{stats} structure can be used as input for @qcode{multcompare}.
-## Note that if the model contains a continuous variable and you wish to use
-## the @var{STATS} output as input to @qcode{multcompare}, then the model needs
-## to be refit with the "contrast" parameter set to a sum-to-zero contrast
-## coding scheme, e.g."simple".
-##
-## @seealso{anovan, multcompare}
+## @seealso{LinearModel, CompactLinearModel, stepwiselm}
## @end deftypefn
-function [T, STATS] = fitlm (X, y, varargin)
+function mdl = fitlm (varargin)
- ## Check input and output arguments
- if (nargin < 2)
- error (strcat ("fitlm usage: ""fitlm (X, y, varargin)""; ", ...
- " atleast 2 input arguments required"));
- endif
- if (nargout > 3)
- error ("fitlm: invalid number of output arguments requested");
- endif
+ if (nargin < 1)
+ print_usage ();
+ endif
- ## Evaluate input data
- [n, N] = size (X);
- msg = strcat ("fitlm: do not include the intercept column", ...
- " in X - it will be added automatically");
- if (iscell (X))
- if (~ iscell (X{:,1}))
- if (all (X{:,1} == 1))
- error (msg)
- endif
- endif
- else
- if (all (X(:,1) == 1))
- error (msg)
- endif
- endif
+ ## ── [Step 1] Parse inputs ────────────────────────────────────────────────
+ [raw_X, raw_y, var_names, cat_flags, weights, excl_mask, ...
+ modelspec, obs_names, resp_name, pred_names, has_intercept, raw_cols, ...
+ dummy_coding] = ...
+ __fitlm_parse_inputs__ (varargin{:});
- ## Fetch anovan options
- options = varargin;
- if (isempty(options))
- options{1} = "linear";
- endif
+ n = rows (raw_y);
+ p = numel (pred_names);
- ## Check if MODELSPEC was provided. If not create it.
- if (ischar (options{1}))
- if (! ismember (lower (options{1}), {"sstype", "varnames", "contrasts", ...
- "weights", "alpha", "display", "continuous", ...
- "categorical", "categoricalvars", "random", "model"}))
- MODELSPEC = options{1};
- options(1) = [];
- CONTINUOUS = [];
- else
- ## If MODELSPEC is not provided, set it for an additive linear model
- MODELSPEC = zeros (N + 1);
- MODELSPEC(2:N+1, 1:N) = eye (N);
- end
- else
- MODELSPEC = options{1};
- options(1) = [];
- endif
+ ## ── [Step 2] Build design matrix ─────────────────────────────────────────
+ [terms_mat, coef_names, X_full, y_full, incl_mask, p_tot, has_intercept, lp_str] = ...
+ __fitlm_build_design__ (raw_X, raw_y, modelspec, var_names, cat_flags, ...
+ weights, excl_mask, pred_names, raw_cols, ...
+ has_intercept, dummy_coding);
- ## Evaluate MODELSPEC
- if (ischar (MODELSPEC))
- MODELSPEC = lower (MODELSPEC);
- if (! isempty (regexp (MODELSPEC, "~")))
- error ("fitlm: model formulae are not a supported format for MODELSPEC")
- endif
- if (! ismember (MODELSPEC, {"constant", "linear", "interaction", ...
- "interactions", "full"}))
- error ("fitlm: character vector for model specification not recognised")
- endif
- if strcmp (MODELSPEC, "constant")
- X = [];
- MODELSPEC = "linear";
- N = 0;
- endif
- else
- if (size (MODELSPEC, 1) < N + 1)
- error ("fitlm: number of rows in MODELSPEC must 1 + number of columns in X");
- endif
- if (size (MODELSPEC, 2) != N + 1)
- error ("fitlm: number of columns in MODELSPEC must = 1 + number of columns in X");
- endif
- if (! all (ismember (MODELSPEC(:), [0,1])))
- error (strcat ("fitlm: elements of the model terms matrix must be ", ...
- " either 0 or 1. Higher order terms are not supported"));
- endif
- MODELSPEC = logical (MODELSPEC(2:N+1,1:N));
- endif
+ ## ── [Step 3] Call math engine on included rows only ──────────────────────
+ X_fit = X_full(incl_mask, :);
+ y_fit = y_full(incl_mask);
+ w_fit = weights(incl_mask);
- ## Check for unsupported options used by anovan
- if (any (strcmpi ("MODEL", options)))
- error (strcat("fitlm: modelspec should be specified in the", ...
- " third input argument of fitlm (if at all)"));
- endif
+ warning ("off", "CompactLinearModel:rankDeficient");
+ s = lm_fit_engine (X_fit, y_fit, w_fit);
+ warning ("on", "CompactLinearModel:rankDeficient");
- ## Check and set variable types
- idx = find (any (cat (1, strcmpi ("categorical", options), ...
- strcmpi ("categoricalvars", options))));
- if (! isempty (idx))
- CONTINUOUS = [1:N];
- CONTINUOUS(ismember(CONTINUOUS,options{idx+1})) = [];
- options(idx:idx+1) = [];
- else
- CONTINUOUS = [1:N];
- endif
- idx = find (strcmpi ("continuous", options));
- if (! isempty (idx))
- ## Note that setting continuous parameter will override settings made
- ## to "categorical"
- CONTINUOUS = options{idx+1};
- endif
+ n_used = s.n;
- ## Check if anovan CONTRASTS option was used
- idx = find (strcmpi ("contrasts", options));
- if (isempty (idx))
- CONTRASTS = "treatment";
- else
- CONTRASTS = options{idx+1};
- if (ischar(CONTRASTS))
- contr_str = CONTRASTS;
- CONTRASTS = cell (1, N);
- CONTRASTS(:) = {contr_str};
- endif
- if (! iscell (CONTRASTS))
- CONTRASTS = {CONTRASTS};
- endif
- for i = 1:N
- if (! isnumeric(CONTRASTS{i}))
- if (! isempty (CONTRASTS{i}))
- if (! ismember (CONTRASTS{i}, ...
- {"simple","poly","helmert","effect","treatment"}))
- error (strcat("fitlm: the choices for built-in contrasts are", ...
- " 'simple', 'poly', 'helmert', 'effect', or 'treatment'"));
- endif
- endif
- endif
- endfor
- endif
+ ## ── [Step 4] Build Formula struct ────────────────────────────────────────
+ ## fromFit expects formula.CoefficientNames to be set
+ Formula = __fitlm_build_formula__ (terms_mat, var_names, pred_names, ...
+ resp_name, has_intercept, lp_str, coef_names);
+ Formula.CoefficientNames = coef_names; ## required by fromFit (line 288)
- ## Check if anovan SSTYPE option was used
- idx = find (strcmpi ("sstype", options));
- if (isempty (idx))
- SSTYPE = 2;
- else
- SSTYPE = options{idx+1};
- endif
+ ## ── [Step 5] Build VariableInfo struct ────────────────────────────────────
+ VariableInfo = __fitlm_build_varinfo__ (var_names, cat_flags, raw_cols, ...
+ Formula.InModel);
+
+ ## ── [Step 6] Build ObservationInfo ───────────────────────────────────────
+ ObservationInfo = __fitlm_build_obsinfo__ (weights, excl_mask, raw_X, raw_y, obs_names);
+
+ ## ── [Step 7] Build Variables struct ──────────────────────────────────────
+ Variables = __fitlm_build_variables_table__ (raw_cols, var_names, obs_names);
- ## Perform model fit and ANOVA
- [~, ~, STATS] = anovan (y, X, options{:}, ...
- "model", MODELSPEC, ...
- "contrasts", CONTRASTS, ...
- "continuous", CONTINUOUS, ...
- "sstype", SSTYPE);
+ ## ── [Step 8] Compute missing mask ────────────────────────────────────────
+ missing_mask = ObservationInfo.Missing;
- ## Create table of regression coefficients
- ncoeff = sum (STATS.df);
- T = cell (2 + ncoeff, 7);
- T(1,:) = {"Parameter", "Estimate", "SE", ...
- "Lower.CI", "Upper.CI", "t", "Prob>|t|"};
- T(2:end,1) = STATS.coeffnames;
- T(2:end,2:7) = num2cell (STATS.coeffs);
+ ## ── [Step 9] Call LinearModel.fromFit ────────────────────────────────────
+ ##
+ ## fromFit signature:
+ ## fromFit(fit_s, X, y, formula, pred_names, resp_name,
+ ## var_names, var_info, obs_names, obs_info,
+ ## weights, excluded, missing_mask, steps, variables_data)
+ ##
+ ## X and y here are the USED (included) subsets — fromFit computes diagnostics
+ mdl = LinearModel.fromFit ( ...
+ s, ... ## lm_fit_engine output
+ X_fit, ... ## design matrix (used rows only)
+ y_fit, ... ## response (used rows only)
+ Formula, ... ## Formula struct (with CoefficientNames field)
+ pred_names, ... ## predictor variable names
+ resp_name, ... ## response variable name
+ var_names, ... ## all variable names
+ VariableInfo, ... ## VariableInfo struct
+ obs_names, ... ## ObservationNames
+ ObservationInfo, ... ## ObservationInfo struct
+ weights, ... ## full n weights
+ excl_mask, ... ## full n exclusion mask
+ missing_mask, ... ## full n missing mask
+ [], ... ## Steps = [] for fitlm
+ Variables, ... ## Variables struct
+ X_full); ## full n design matrix for excluded-row Fitted
+
+endfunction
- ## Update STATS structure
- STATS.source = "fitlm";
+## ── Private helper — build Variables struct ───────────────────────────────────
+
+function Vars = __fitlm_build_variables_table__ (raw_cols, var_names, obs_names)
+ ## Build Variables as a struct where each field = one column (all n rows).
+ Vars = struct ();
+ for k = 1:numel (var_names)
+ vn = var_names{k};
+ ## Replace invalid fieldname chars
+ fn = regexprep (vn, '[^a-zA-Z0-9_]', '_');
+ if (isempty (fn) || ! isempty (regexp (fn(1), '[^a-zA-Z_]')))
+ fn = ['v_', fn];
+ endif
+ Vars.(fn) = raw_cols{k};
+ endfor
endfunction
-%!demo
-%! y = [ 8.706 10.362 11.552 6.941 10.983 10.092 6.421 14.943 15.931 ...
-%! 22.968 18.590 16.567 15.944 21.637 14.492 17.965 18.851 22.891 ...
-%! 22.028 16.884 17.252 18.325 25.435 19.141 21.238 22.196 18.038 ...
-%! 22.628 31.163 26.053 24.419 32.145 28.966 30.207 29.142 33.212 ...
-%! 25.694 ]';
-%! X = [1 1 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5]';
-%!
-%! [TAB,STATS] = fitlm (X,y,"linear","CategoricalVars",1,"display","on");
-
-%!demo
-%! popcorn = [5.5, 4.5, 3.5; 5.5, 4.5, 4.0; 6.0, 4.0, 3.0; ...
-%! 6.5, 5.0, 4.0; 7.0, 5.5, 5.0; 7.0, 5.0, 4.5];
-%! brands = {'Gourmet', 'National', 'Generic'; ...
-%! 'Gourmet', 'National', 'Generic'; ...
-%! 'Gourmet', 'National', 'Generic'; ...
-%! 'Gourmet', 'National', 'Generic'; ...
-%! 'Gourmet', 'National', 'Generic'; ...
-%! 'Gourmet', 'National', 'Generic'};
-%! popper = {'oil', 'oil', 'oil'; 'oil', 'oil', 'oil'; 'oil', 'oil', 'oil'; ...
-%! 'air', 'air', 'air'; 'air', 'air', 'air'; 'air', 'air', 'air'};
-%!
-%! [TAB, STATS] = fitlm ({brands(:),popper(:)},popcorn(:),"interactions",...
-%! "CategoricalVars",[1,2],"display","on");
+## ─── UNIT TESTS ───────────────────────────────────────────────────────────────
+
+## Test 1: basic matrix input returns LinearModel
+%!test
+%! rng (42);
+%! X = randn (20, 3);
+%! y = X * [1; 2; -1] + randn (20, 1) * 0.5;
+%! mdl = fitlm (X, y);
+%! assert (isa (mdl, 'LinearModel'));
+%! assert (mdl.NumPredictors, 3);
+%! assert (mdl.NumObservations, 20);
+%! assert (mdl.NumCoefficients, 4);
+%! assert (isempty (mdl.Steps));
+%! assert (mdl.DFE, 16);
+
+## Test 2: VariableNames and ResponseName for matrix input
+%!test
+%! rng (1);
+%! X = randn (10, 2);
+%! y = X * [2; -1] + randn (10, 1) * 0.3;
+%! mdl = fitlm (X, y);
+%! assert (isequal (mdl.VariableNames, {'x1', 'x2', 'y'}));
+%! assert (strcmp (mdl.ResponseName, 'y'));
+%! assert (isequal (mdl.PredictorNames, {'x1', 'x2'}));
+%! assert (mdl.NumVariables, 3);
+
+## Test 3: VarNames override
+%!test
+%! rng (1);
+%! X = randn (15, 2);
+%! y = X * [3; -1] + randn (15, 1) * 0.5;
+%! mdl = fitlm (X, y, 'VarNames', {'Height', 'Weight', 'BP'});
+%! assert (strcmp (mdl.ResponseName, 'BP'));
+%! assert (isequal (mdl.PredictorNames, {'Height', 'Weight'}));
+%! assert (isequal (mdl.CoefficientNames, {'(Intercept)', 'Height', 'Weight'}));
+
+## Test 5: modelspec 'constant'
+%!test
+%! rng (1);
+%! X = randn (10, 3);
+%! y = randn (10, 1);
+%! mdl = fitlm (X, y, 'constant');
+%! assert (mdl.NumCoefficients, 1);
+%! assert (isequal (mdl.CoefficientNames, {'(Intercept)'}));
+
+## Test 6: modelspec 'interactions'
+%!test
+%! rng (1);
+%! X = randn (15, 2);
+%! y = randn (15, 1);
+%! mdl = fitlm (X, y, 'interactions');
+%! assert (mdl.NumCoefficients, 4); ## 1 + x1 + x2 + x1:x2
+%! assert (any (strcmp (mdl.CoefficientNames, 'x1:x2')));
+
+## Test 7: Intercept=false
+%!test
+%! rng (42);
+%! X = randn (20, 2);
+%! y = X * [3; -1] + randn (20, 1) * 0.3;
+%! mdl = fitlm (X, y, 'Intercept', false);
+%! assert (mdl.NumCoefficients, 2);
+%! assert (mdl.DFE, 18);
+%! assert (mdl.Formula.HasIntercept, false);
+%! assert (! any (strcmp (mdl.CoefficientNames, '(Intercept)')));
+
+## Test 8: Exclude integer indices
+%!test
+%! rng (1);
+%! X = randn (10, 2);
+%! y = X * [2; -1] + randn (10, 1) * 0.3;
+%! mdl = fitlm (X, y, 'Exclude', [3, 7]);
+%! assert (mdl.NumObservations, 8);
+%! assert (mdl.ObservationInfo.Excluded(3), true);
+%! assert (mdl.ObservationInfo.Excluded(7), true);
+%! assert (mdl.ObservationInfo.Subset(3), false);
+%! assert (isnan (mdl.Residuals.Raw(3)));
+%! assert (isnan (mdl.Residuals.Raw(7)));
+
+## Test 9: Exclude logical vector gives same coefficients as integer
+%!test
+%! rng (1);
+%! X = randn (10, 2);
+%! y = X * [2; -1] + randn (10, 1) * 0.3;
+%! mdl_int = fitlm (X, y, 'Exclude', [3, 7]);
+%! excl_log = false (10, 1); excl_log([3,7]) = true;
+%! mdl_log = fitlm (X, y, 'Exclude', excl_log);
+%! assert (mdl_int.Coefficients.Estimate, mdl_log.Coefficients.Estimate, 1e-12);
+
+## Test 10: NaN in X → Missing flag, Fitted = NaN
+%!test
+%! rng (1);
+%! X = randn (10, 2);
+%! y = X * [2; -1] + randn (10, 1) * 0.3;
+%! X(5, 1) = NaN;
+%! mdl = fitlm (X, y);
+%! assert (mdl.NumObservations, 9);
+%! assert (mdl.ObservationInfo.Missing(5), true);
+%! assert (isnan (mdl.Fitted(5)));
+%! assert (isnan (mdl.Residuals.Raw(5)));
+
+## Test 11: Weighted regression
+%!test
+%! rng (42);
+%! X = randn (15, 2);
+%! y = X * [3; -1] + randn (15, 1) * 0.5;
+%! w = abs (randn (15, 1)) + 0.5;
+%! mdl_w = fitlm (X, y, 'Weights', w);
+%! mdl_u = fitlm (X, y);
+%! assert (mdl_w.DFE, mdl_u.DFE);
+%! assert (mdl_w.NumObservations, 15);
+%! assert (mdl_w.ObservationInfo.Weights, w, 1e-14);
+
+## Test 12: CategoricalVars on numeric matrix
+%!test
+%! rng (42);
+%! X = [randn(30, 1), repmat([1;2;3], 10, 1), randn(30, 1)];
+%! y = X(:,1)*2 + (X(:,2)==2)*1.5 + randn(30,1)*0.3;
+%! mdl = fitlm (X, y, 'CategoricalVars', 2);
+%! assert (mdl.VariableInfo.IsCategorical(2), true);
+%! assert (mdl.NumCoefficients, 5); ## intercept + x1 + x2_2 + x2_3 + x3
+%! assert (mdl.NumPredictors, 3);
+%! assert (any (strcmp (mdl.CoefficientNames, 'x2_2')));
+%! assert (any (strcmp (mdl.CoefficientNames, 'x2_3')));
+
+## Test 13: compact() preserves key fields
+%!test
+%! rng (1);
+%! X = randn (15, 2);
+%! y = X * [2; -1] + randn (15, 1) * 0.3;
+%! mdl = fitlm (X, y);
+%! cmdl = compact (mdl);
+%! assert (isa (cmdl, 'CompactLinearModel'));
+%! assert (! isa (cmdl, 'LinearModel'));
+%! assert (cmdl.MSE, mdl.MSE, 1e-10);
+%! assert (cmdl.RMSE, mdl.RMSE, 1e-10);
+%! assert (cmdl.DFE, mdl.DFE);
+
+## Test 14: rank-deficient — NumEstimatedCoefficients < NumCoefficients
+%!test
+%! warning ('off', 'CompactLinearModel:rankDeficient');
+%! rng (1);
+%! X_base = randn (20, 2);
+%! X_rd = [X_base, X_base(:,1) + X_base(:,2)];
+%! y = X_base * [2; -1] + randn (20, 1) * 0.3;
+%! mdl = fitlm (X_rd, y);
+%! assert (mdl.NumCoefficients, 4);
+%! assert (mdl.NumEstimatedCoefficients, 3);
+%! warning ('on', 'CompactLinearModel:rankDeficient');
+
+## Test 15: Steps is [] for fitlm models
%!test
-%! y = [ 8.706 10.362 11.552 6.941 10.983 10.092 6.421 14.943 15.931 ...
-%! 22.968 18.590 16.567 15.944 21.637 14.492 17.965 18.851 22.891 ...
-%! 22.028 16.884 17.252 18.325 25.435 19.141 21.238 22.196 18.038 ...
-%! 22.628 31.163 26.053 24.419 32.145 28.966 30.207 29.142 33.212 ...
-%! 25.694 ]';
-%! X = [1 1 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5]';
-%! [TAB,STATS] = fitlm (X,y,"continuous",[],"display","off");
-%! [TAB,STATS] = fitlm (X,y,"CategoricalVars",1,"display","off");
-%! [TAB,STATS] = fitlm (X,y,"constant","categorical",1,"display","off");
-%! [TAB,STATS] = fitlm (X,y,"linear","categorical",1,"display","off");
-%! [TAB,STATS] = fitlm (X,y,[0,0;1,0],"categorical",1,"display","off");
-%! assert (TAB{2,2}, 10, 1e-04);
-%! assert (TAB{3,2}, 7.99999999999999, 1e-09);
-%! assert (TAB{4,2}, 8.99999999999999, 1e-09);
-%! assert (TAB{5,2}, 11.0001428571429, 1e-09);
-%! assert (TAB{6,2}, 19.0001111111111, 1e-09);
-%! assert (TAB{2,3}, 1.01775379540949, 1e-09);
-%! assert (TAB{3,3}, 1.64107868458008, 1e-09);
-%! assert (TAB{4,3}, 1.43932122062479, 1e-09);
-%! assert (TAB{5,3}, 1.48983900477565, 1e-09);
-%! assert (TAB{6,3}, 1.3987687997822, 1e-09);
-%! assert (TAB{2,6}, 9.82555903510687, 1e-09);
-%! assert (TAB{3,6}, 4.87484242844031, 1e-09);
-%! assert (TAB{4,6}, 6.25294748040552, 1e-09);
-%! assert (TAB{5,6}, 7.38344399756088, 1e-09);
-%! assert (TAB{6,6}, 13.5834536158296, 1e-09);
-%! assert (TAB{3,7}, 2.85812420217862e-05, 1e-12);
-%! assert (TAB{4,7}, 5.22936741204002e-07, 1e-06);
-%! assert (TAB{5,7}, 2.12794763209106e-08, 1e-07);
-%! assert (TAB{6,7}, 7.82091664406755e-15, 1e-08);
+%! rng (1);
+%! X = randn (10, 2);
+%! y = randn (10, 1);
+%! mdl = fitlm (X, y);
+%! assert (isempty (mdl.Steps));
+## Test 17: Wilkinson formula string input
%!test
-%! popcorn = [5.5, 4.5, 3.5; 5.5, 4.5, 4.0; 6.0, 4.0, 3.0; ...
-%! 6.5, 5.0, 4.0; 7.0, 5.5, 5.0; 7.0, 5.0, 4.5];
-%! brands = bsxfun (@times, ones(6,1), [1,2,3]);
-%! popper = bsxfun (@times, [1;1;1;2;2;2], ones(1,3));
-%!
-%! [TAB, STATS] = fitlm ({brands(:),popper(:)},popcorn(:),"interactions",...
-%! "categoricalvars",[1,2],"display","off");
-%! assert (TAB{2,2}, 5.66666666666667, 1e-09);
-%! assert (TAB{3,2}, -1.33333333333333, 1e-09);
-%! assert (TAB{4,2}, -2.16666666666667, 1e-09);
-%! assert (TAB{5,2}, 1.16666666666667, 1e-09);
-%! assert (TAB{6,2}, -0.333333333333334, 1e-09);
-%! assert (TAB{7,2}, -0.166666666666667, 1e-09);
-%! assert (TAB{2,3}, 0.215165741455965, 1e-09);
-%! assert (TAB{3,3}, 0.304290309725089, 1e-09);
-%! assert (TAB{4,3}, 0.304290309725089, 1e-09);
-%! assert (TAB{5,3}, 0.304290309725089, 1e-09);
-%! assert (TAB{6,3}, 0.43033148291193, 1e-09);
-%! assert (TAB{7,3}, 0.43033148291193, 1e-09);
-%! assert (TAB{2,6}, 26.3362867542108, 1e-09);
-%! assert (TAB{3,6}, -4.38178046004138, 1e-09);
-%! assert (TAB{4,6}, -7.12039324756724, 1e-09);
-%! assert (TAB{5,6}, 3.83405790253621, 1e-09);
-%! assert (TAB{6,6}, -0.774596669241495, 1e-09);
-%! assert (TAB{7,6}, -0.387298334620748, 1e-09);
-%! assert (TAB{2,7}, 5.49841502258254e-12, 1e-09);
-%! assert (TAB{3,7}, 0.000893505495903642, 1e-09);
-%! assert (TAB{4,7}, 1.21291454302428e-05, 1e-09);
-%! assert (TAB{5,7}, 0.00237798044119407, 1e-09);
-%! assert (TAB{6,7}, 0.453570536021938, 1e-09);
-%! assert (TAB{7,7}, 0.705316781644046, 1e-09);
-%! ## Test with string ids for categorical variables
-%! brands = {'Gourmet', 'National', 'Generic'; ...
-%! 'Gourmet', 'National', 'Generic'; ...
-%! 'Gourmet', 'National', 'Generic'; ...
-%! 'Gourmet', 'National', 'Generic'; ...
-%! 'Gourmet', 'National', 'Generic'; ...
-%! 'Gourmet', 'National', 'Generic'};
-%! popper = {'oil', 'oil', 'oil'; 'oil', 'oil', 'oil'; 'oil', 'oil', 'oil'; ...
-%! 'air', 'air', 'air'; 'air', 'air', 'air'; 'air', 'air', 'air'};
-%! [TAB, STATS] = fitlm ({brands(:),popper(:)},popcorn(:),"interactions",...
-%! "categoricalvars",[1,2],"display","off");
+%! rng (1);
+%! X = randn (20, 2);
+%! y = X * [1; 2] + randn (20, 1) * 0.5;
+%! mdl = fitlm (X, y, 'y ~ x1 + x2');
+%! assert (isa (mdl, 'LinearModel'));
+%! assert (mdl.Formula.HasIntercept, true);
+%! assert (strcmp (mdl.Formula.ResponseName, 'y'));
+## Test 18: anova() still works on fitlm output
%!test
-%! load carsmall
-%! X = [Weight,Horsepower,Acceleration];
-%! [TAB, STATS] = fitlm (X, MPG,"constant","display","off");
-%! [TAB, STATS] = fitlm (X, MPG,"linear","display","off");
-%! assert (TAB{2,2}, 47.9767628118615, 1e-09);
-%! assert (TAB{3,2}, -0.00654155878851796, 1e-09);
-%! assert (TAB{4,2}, -0.0429433065881864, 1e-09);
-%! assert (TAB{5,2}, -0.0115826516894871, 1e-09);
-%! assert (TAB{2,3}, 3.87851641748551, 1e-09);
-%! assert (TAB{3,3}, 0.00112741016370336, 1e-09);
-%! assert (TAB{4,3}, 0.0243130608813806, 1e-09);
-%! assert (TAB{5,3}, 0.193325043113178, 1e-09);
-%! assert (TAB{2,6}, 12.369874881944, 1e-09);
-%! assert (TAB{3,6}, -5.80228828790225, 1e-09);
-%! assert (TAB{4,6}, -1.76626492228599, 1e-09);
-%! assert (TAB{5,6}, -0.0599128364487485, 1e-09);
-%! assert (TAB{2,7}, 4.89570341688996e-21, 1e-09);
-%! assert (TAB{3,7}, 9.87424814144e-08, 1e-09);
-%! assert (TAB{4,7}, 0.0807803098213114, 1e-09);
-%! assert (TAB{5,7}, 0.952359384151778, 1e-09);
+%! rng (1);
+%! X = randn (20, 2);
+%! y = X * [1; 2] + randn (20, 1) * 0.5;
+%! mdl = fitlm (X, y);
+%! T = anova (mdl);
+%! assert (isstruct (T));
diff --git a/inst/grp2idx.m b/inst/grp2idx.m
index ae7ed87b..ed376491 100644
--- a/inst/grp2idx.m
+++ b/inst/grp2idx.m
@@ -1,5 +1,6 @@
## Copyright (C) 2015 Carnë Draug
## Copyright (C) 2022-2026 Andreas Bertsatos
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
##
## This file is part of the statistics package for GNU Octave.
##
@@ -67,6 +68,20 @@
if (nargin != 1)
print_usage ();
endif
+
+ ## Handle empty input immediately
+ if (isempty (s))
+ if (! isvector (s) && ! ischar (s) && ! isstring (s) && ! iscellstr (s) && ! iscategorical (s))
+ error ("grp2idx: Grouping variable must be a vector, character array, string array, or cell array of character vectors.");
+ endif
+ g = [];
+ gn = {};
+ if (nargout > 2)
+ gl = [];
+ endif
+ return;
+ endif
+
if (ndims (s) != 2)
error ("grp2idx: S must be either a vector or a matrix.");
endif
@@ -82,10 +97,36 @@
error ("grp2idx: 'categorical' grouping variable must be a vector.");
endif
is_categorical = true;
- undef = isundefined (s);
cats = categories (s);
- s = cellstr (s);
- s(undef) = {""};
+ s_text = cellstr (s(:));
+ g = NaN (numel (s_text), 1);
+
+ if (nargout > 1)
+ if (isempty (cats))
+ gn = cell (0, 1);
+ else
+ gn = cellstr (cats);
+ endif
+ endif
+
+ if (nargout > 2)
+ if (isempty (cats))
+ gl = categorical (cell (0, 1));
+ else
+ gl = categorical (cellstr (cats), cellstr (cats), cellstr (cats));
+ endif
+ endif
+
+ if (isempty (cats))
+ return;
+ endif
+
+ cat_names = cellstr (cats);
+ undef = isundefined (s(:));
+ for i = 1:numel (cat_names)
+ g(! undef & strcmp (s_text, cat_names{i})) = i;
+ endfor
+ return;
elseif (ischar (s))
is_char_array = true;
s = cellstr (s);
@@ -121,16 +162,7 @@
[gl, I, g] = unique (s(:));
## Fix order in here, since unique does not support this yet
if (iscellstr (s) && ! is_categorical)
- I = sort (I);
- for i = 1:length (gl)
- gl_s(i) = gl(g(I(i)));
- idx(i,:) = (g == g(I(i)));
- endfor
- for i = 1:length (gl)
- g(idx(i,:)) = i;
- endfor
- gl = gl_s;
- gl = gl';
+ [g, gl] = g2i_reindex_stable (g, gl, I);
endif
## handle NaNs and empty strings
@@ -208,6 +240,24 @@
endfunction
+function [g, gl] = g2i_reindex_stable (g, gl, first_idx)
+
+ first_idx = sort (first_idx(:));
+ ng = numel (gl);
+ new_gl = cell (ng, 1);
+ new_g = g;
+
+ for i = 1:ng
+ old_idx = g(first_idx(i));
+ new_gl{i} = gl{old_idx};
+ new_g(g == old_idx) = i;
+ endfor
+
+ gl = new_gl;
+ g = new_g;
+
+endfunction
+
# test for one output argument
%!test
%! g = grp2idx ([3 2 1 2 3 1]);
@@ -390,6 +440,13 @@
%! assert (iscategorical (gl), true);
%! assert (cellstr (gl), gn);
+%!test
+%! s = categorical ({'a'; 'b'; 'a'}, {'b', 'a', 'c'}, {'b', 'a', 'c'});
+%! [g, gn, gl] = grp2idx (s);
+%! assert (g, [2; 1; 2]);
+%! assert (gn, {'b'; 'a'; 'c'});
+%! assert (cellstr (gl), gn);
+
## test for duration arrays
%!test
%! g = gn = gl = [];
diff --git a/inst/parseWilkinsonFormula.m b/inst/parseWilkinsonFormula.m
index 703c260c..f0a3d6fe 100644
--- a/inst/parseWilkinsonFormula.m
+++ b/inst/parseWilkinsonFormula.m
@@ -1,5 +1,6 @@
## Copyright (C) 2026 Andreas Bertsatos
## Copyright (C) 2026 Avanish Salunke
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
##
## This file is part of the statistics package for GNU Octave.
##
@@ -201,7 +202,7 @@
else
lhs_vars = resolve_lhs_symbolic (lhs_str);
endif
-
+
## build the required output.
varargout{1} = run_equation_builder (lhs_vars, rhs_terms);
@@ -519,14 +520,14 @@
args_str_parts = {};
for k = 1:length (node.args)
arg_res = run_expander (node.args{k}, mode);
-
+
if (! isempty (arg_res) && ! isempty (arg_res{1}))
- args_str_parts{end+1} = arg_res{1}{1};
+ args_str_parts{end+1} = arg_res{1}{1};
else
args_str_parts{end+1} = '';
endif
endfor
-
+
full_term = sprintf ("%s(%s)", node.name, strjoin (args_str_parts, ','));
result = {{full_term}};
else
@@ -774,11 +775,8 @@
endif
endfunction
- ## extract variables
+ ## extract variables — collect RHS predictors first, then LHS response
all_vars = {};
- if (! isempty (lhs_term))
- all_vars = [all_vars, flatten_recursive(lhs_term)];
- endif
cleaned_rhs = cell (length (rhs_terms), 1);
for i = 1:length (rhs_terms)
@@ -789,11 +787,26 @@
final_term_vars = [final_term_vars, parts];
endfor
cleaned_rhs{i} = final_term_vars;
- all_vars = [all_vars, final_term_vars];
+ ## Add only the BASE variable name to all_vars (strip ^k suffix)
+ for j = 1:length (final_term_vars)
+ tv = final_term_vars{j};
+ caret = strfind (tv, '^');
+ if (! isempty (caret))
+ all_vars{end+1} = tv(1:caret(1)-1);
+ else
+ all_vars{end+1} = tv;
+ endif
+ endfor
endfor
- all_vars = unique (all_vars);
- ## Remove intercept marker from var list
+ ## Append LHS (response) variables after RHS predictors
+ if (! isempty (lhs_term))
+ all_vars = [all_vars, flatten_recursive(lhs_term)];
+ endif
+
+ ## Stable dedup (preserves first-seen order)
+ [~, ui] = unique (all_vars, 'stable');
+ all_vars = all_vars(sort (ui));
all_vars(strcmp (all_vars, '1')) = [];
schema.VariableNames = all_vars;
@@ -808,7 +821,7 @@
endif
endif
- ## Build terms matrix
+ ## Build terms matrix (stores exponents, not just binary 1)
n_vars = length (all_vars);
n_terms = length (cleaned_rhs);
terms_mat = zeros (n_terms, n_vars);
@@ -822,22 +835,46 @@
continue;
endif
- [found, idx] = ismember (vars_in_this_term, all_vars);
- if (any (! found))
- error ("parseWilkinsonFormula: Unknown variable in term definition.");
- endif
- terms_mat(i, idx) = 1;
+ for jj = 1:length (vars_in_this_term)
+ tv = vars_in_this_term{jj};
+ caret = strfind (tv, '^');
+ if (! isempty (caret))
+ base_name = tv(1:caret(1)-1);
+ exp_val = str2double (tv(caret(1)+1:end));
+ else
+ base_name = tv;
+ exp_val = 1;
+ endif
+ col = find (strcmp (all_vars, base_name));
+ if (isempty (col))
+ error ("parseWilkinsonFormula: Unknown variable '%s' in term.", ...
+ base_name);
+ endif
+ terms_mat(i, col) = exp_val;
+ endfor
endfor
- ## sorting : order by order.
- term_orders = sum (terms_mat, 2);
- M = [term_orders, terms_mat];
+ ## sorting: order by (1) number of active variables ascending,
+ ## (2) binary mask descending (x1 before x2 before x3),
+ ## (3) exponent values ascending (x before x^2)
+ term_orders = sum (terms_mat > 0, 2);
+ binary_mask = double (terms_mat > 0);
+ M = [term_orders, binary_mask, terms_mat];
- [~, unique_idx] = unique (M, 'rows');
+ ## Create unique rows based on actual terms
+ [~, unique_idx] = unique ([term_orders, terms_mat], 'rows');
terms_mat = terms_mat (unique_idx, :);
+ cleaned_rhs = cleaned_rhs(unique_idx);
+ M = M (unique_idx, :);
+
+ n_v = size (terms_mat, 2);
+ ## Direction: [asc on order, desc on binary mask, asc on exponents]
+ sort_dirs = [1, -(2:1+n_v), (2+n_v:1+2*n_v)];
- [~, sort_idx] = sortrows ([sum(terms_mat, 2), terms_mat]);
+ ## Sort using the direction vector
+ [~, sort_idx] = sortrows (M, sort_dirs);
schema.Terms = terms_mat (sort_idx, :);
+ schema.TermTokens = cleaned_rhs(sort_idx);
endfunction
@@ -921,6 +958,19 @@
endif
endfor
+ ## Reorder schema.VariableNames to match table column order (Fix B4)
+ tbl_col_order = data.Properties.VariableNames;
+ [~, pos] = ismember (schema.VariableNames, tbl_col_order);
+ pos(pos == 0) = length (tbl_col_order) + 1;
+ [~, reorder_idx] = sort (pos);
+ schema.VariableNames = schema.VariableNames(reorder_idx);
+ schema.Terms = schema.Terms(:, reorder_idx);
+ if (! isempty (schema.ResponseIdx))
+ resp_name = req_vars{schema.ResponseIdx};
+ schema.ResponseIdx = find (strcmp (schema.VariableNames, resp_name));
+ endif
+ req_vars = schema.VariableNames;
+
## Build Design Matrix X
X = [];
col_names = {};
@@ -930,6 +980,7 @@
has_intercept = ! isempty (intercept_row_idx);
n_terms = size (schema.Terms, 1);
+ cleaned_rhs_terms = schema.TermTokens;
for i = 1:n_terms
term_row = schema.Terms(i, :);
@@ -942,20 +993,53 @@
continue;
endif
+ ## Sort vars_idx: numeric variables before categorical (Fix B5)
+ if (length (vars_idx) > 1)
+ is_cat = arrayfun (@(v) strcmp (var_info.(req_vars{v}).type, ...
+ 'categorical'), vars_idx);
+ [~, sort_order] = sort (is_cat);
+ vars_idx = vars_idx(sort_order);
+ endif
+
current_block = ones (n_rows, 1);
current_names = {''};
+ ## Resolve term tokens for polynomial support (Fix B3)
+ term_tokens = cleaned_rhs_terms{i};
+ token_idx = 0;
+
for v = vars_idx
- vname = req_vars{v};
+ token_idx = token_idx + 1;
+ if (! isempty (term_tokens) && token_idx <= length (term_tokens))
+ term_token = term_tokens{token_idx};
+ else
+ term_token = req_vars{v};
+ endif
+
+ caret = strfind (term_token, '^');
+ if (! isempty (caret))
+ vname = term_token(1:caret(1)-1);
+ else
+ vname = req_vars{v};
+ endif
info = var_info.(vname);
if (strcmp (info.type, 'numeric'))
- current_block = current_block .* info.data;
+ ## Resolve polynomial exponent from term token
+ if (! isempty (caret))
+ exp_val = str2double (term_token(caret(1)+1:end));
+ col_data = info.data .^ exp_val;
+ disp_name = term_token;
+ else
+ col_data = info.data;
+ disp_name = vname;
+ endif
+ current_block = current_block .* col_data;
for k = 1:length (current_names)
if (isempty (current_names{k}))
- current_names{k} = vname;
+ current_names{k} = disp_name;
else
- current_names{k} = [current_names{k}, ':', vname];
+ current_names{k} = [current_names{k}, ':', disp_name];
endif
endfor
else
@@ -1140,14 +1224,14 @@
## process RHS
rhs_tokens = run_lexer (rhs_str);
[rhs_tree, ~] = run_parser (rhs_tokens);
-
+
wrapper.type = 'OPERATOR';
wrapper.value = '~';
wrapper.left = [];
wrapper.right = rhs_tree;
-
+
expanded = run_expander (wrapper, mode);
-
+
## extract the terms.
if (isstruct (expanded) && isfield (expanded, 'model'))
rhs_terms = expanded.model;
@@ -1164,22 +1248,22 @@
for i = 1:length (parts)
p = strtrim (parts{i});
if (isempty (p)), continue; endif
-
+
range_parts = strsplit (p, '-');
-
+
if (length (range_parts) == 2)
s_str = strtrim (range_parts{1});
e_str = strtrim (range_parts{2});
-
+
[s_tok] = regexp (s_str, '^([a-zA-Z_]\w*)(\d+)$', 'tokens');
[e_tok] = regexp (e_str, '^([a-zA-Z_]\w*)(\d+)$', 'tokens');
-
+
if (! isempty (s_tok) && ! isempty (e_tok))
prefix = s_tok{1}{1};
s_num = str2double (s_tok{1}{2});
e_prefix = e_tok{1}{1};
e_num = str2double (e_tok{1}{2});
-
+
if (strcmp (prefix, e_prefix) && s_num <= e_num)
for n = s_num:e_num
vars{end+1} = sprintf ("%s%d", prefix, n);
@@ -1203,7 +1287,7 @@
for i = 1:length (rhs_terms)
t = rhs_terms{i};
if (isempty (t))
- term_strs{end+1} = '';
+ term_strs{end+1} = '';
else
if (length (t) == 1 && any (strfind (t{1}, "(")))
term_strs{end+1} = t{1};
@@ -1229,13 +1313,13 @@
rhs_parts{end+1} = sprintf ("%s*%s", coeff, t_str);
endif
endfor
-
+
full_rhs = strjoin (rhs_parts, ' + ');
if (isempty (full_rhs)), full_rhs = '0'; endif
lines{end+1} = sprintf ("%s = %s", lhs_vars{k}, full_rhs);
endfor
- eq_list = string (lines');
+ eq_list = string (lines');
endfunction
%!demo
@@ -1271,7 +1355,7 @@
%!demo
%!
-%! ## Interaction Effects :
+%! ## Interaction Effects :
%! ## We analyze Relief Score based on Drug Type and Dosage Level.
%! ## The '*' operator expands to the main effects PLUS the interaction term.
%! ## Categorical variables are automatically created.
@@ -1287,11 +1371,11 @@
%!demo
%!
-%! ## Polynomial Regression :
+%! ## Polynomial Regression :
%! ## Uses the power operator (^) to model non-linear relationships.
%! Distance = [20; 45; 80; 125];
%! Speed = [30; 50; 70; 90];
-%! Speed_2 = Speed .^ 2;
+%! Speed_2 = Speed .^ 2;
%! t = table (Distance, Speed, Speed_2, 'VariableNames', {'Distance', 'Speed', 'Speed^2'});
%!
%! formula = 'Distance ~ Speed^2';
@@ -1316,7 +1400,7 @@
%!demo
%!
-%! ## Explicit Nesting :
+%! ## Explicit Nesting :
%! ## The parser also supports the explicit 'B(A)' syntax, which means
%! ## 'B is nested within A'. This is equivalent to the interaction 'A:B'
%! ## but often used to denote random effects or specific hierarchy.
@@ -1327,7 +1411,7 @@
%!demo
%!
-%! ## Excluding Terms :
+%! ## Excluding Terms :
%! ## Demonstrates building a complex model and then simplifying it.
%! ## We define a full 3-way interaction (A*B*C) but explicitly remove the
%! ## three-way term (A:B:C) using the minus operator.
@@ -1338,7 +1422,7 @@
%!demo
%!
-%! ## Repeated Measures :
+%! ## Repeated Measures :
%! ## This allows predicting multiple outcomes simultaneously.
%! ## The range operator '-' selects all variables between 'T1' and 'T3'
%! ## as the response matrix Y.
@@ -1487,7 +1571,7 @@
%! C = {'lo'; 'hi'};
%! d = table (y, N, C);
%! [M, ~, names] = parseWilkinsonFormula ('~ N * C', 'model_matrix', d);
-%! assert (any (strcmp (names, 'C_lo:N')));
+%! assert (any (strcmp (names, 'N:C_lo')));
%!test
%! ## Test : Intercept Only Model
%! y = [1; 2; 3];
@@ -1659,6 +1743,48 @@
%! eq = parseWilkinsonFormula ('y ~ A - A', 'equation');
%! expected = string('y = c1');
%! assert (isequal (eq, expected));
+%!test
+%! ## Verify parseWilkinsonFormula schema matches MATLAB fitlm sorting
+%! formula = 'Y ~ x1 * x2 * x3';
+%! schema = parseWilkinsonFormula(formula, 'matrix');
+%!
+%! ## Columns: predictors first in formula order, response last
+%! expected_terms = [0, 0, 0, 0; ## (Intercept)
+%! 1, 0, 0, 0; ## x1
+%! 0, 1, 0, 0; ## x2
+%! 0, 0, 1, 0; ## x3
+%! 1, 1, 0, 0; ## x1:x2
+%! 1, 0, 1, 0; ## x1:x3
+%! 0, 1, 1, 0; ## x2:x3
+%! 1, 1, 1, 0]; ## x1:x2:x3
+%!
+%! assert(schema.VariableNames, {'x1', 'x2', 'x3', 'Y'});
+%! assert(schema.Terms, expected_terms);
+%!test
+%! ## polynomial x^2 produces base var with exponent in terms matrix
+%! s = parseWilkinsonFormula ('y ~ x^2', 'matrix');
+%! assert (s.VariableNames, {'x', 'y'});
+%! assert (s.Terms, [0 0; 1 0; 2 0]);
+%!test
+%! ## polynomial model_matrix computes x.^2 column
+%! x = (1:5)'; y = x .^ 2;
+%! t = table (x, y);
+%! [X, ~, names] = parseWilkinsonFormula ('y ~ x^2', 'model_matrix', t);
+%! assert (names, {'(Intercept)'; 'x'; 'x^2'});
+%! assert (X(:,2), x);
+%! assert (X(:,3), x .^ 2);
+%!test
+%! ## variable ordering follows table column order
+%! Age = [25;30]; Weight = [70;75]; BP = [120;122];
+%! t = table (Age, Weight, BP);
+%! [~, ~, n] = parseWilkinsonFormula ('BP ~ Age * Weight', 'model_matrix', t);
+%! assert (n, {'(Intercept)'; 'Age'; 'Weight'; 'Age:Weight'});
+%!test
+%! ## numeric columns named before categorical in interactions
+%! N = [10;20]; C = {'lo';'hi'}; y = [1;2];
+%! d = table (N, C, y);
+%! [~, ~, n] = parseWilkinsonFormula ('y ~ N * C', 'model_matrix', d);
+%! assert (n, {'(Intercept)'; 'N'; 'C_lo'; 'N:C_lo'});
%!error parseWilkinsonFormula ()
%!error parseWilkinsonFormula ('y ~ x', 'invalid_mode')
%!error parseWilkinsonFormula ('', 'parse')
diff --git a/inst/private/__fitlm_build_design__.m b/inst/private/__fitlm_build_design__.m
new file mode 100644
index 00000000..08610bed
--- /dev/null
+++ b/inst/private/__fitlm_build_design__.m
@@ -0,0 +1,754 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see .
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {[@var{terms_mat}, @var{coef_names}, @var{X_full}, @
+## @var{y_full}, @var{incl_mask}, @var{p_tot}, @var{has_intercept}, @
+## @var{lp_str}] =} __fitlm_build_design__ (@var{raw_X}, @var{raw_y}, @
+## @var{modelspec}, @var{var_names}, @var{cat_flags}, @var{weights}, @
+## @var{excl_mask}, @var{pred_names}, @var{raw_cols}, @var{has_intercept})
+##
+## Build design matrix and terms matrix from parsed fitlm inputs.
+##
+## Returns:
+## terms_mat - (n_terms x n_all_vars) terms matrix of nonnegative integer exponents (last col = response = 0)
+## coef_names - 1 x p_tot cell of char coefficient names
+## X_full - n x p_tot design matrix (ALL rows, NaN for missing rows)
+## y_full - n x 1 response (NaN for missing rows)
+## incl_mask - n x 1 logical: rows used in fit
+## p_tot - total columns in design matrix
+## has_intercept - logical
+## lp_str - LinearPredictor RHS string
+## @end deftypefn
+
+function [terms_mat, coef_names, X_full, y_full, incl_mask, p_tot, has_intercept, lp_str] = ...
+ __fitlm_build_design__ (raw_X, raw_y, modelspec, var_names, cat_flags, ...
+ weights, excl_mask, pred_names, raw_cols, ...
+ has_intercept, dummy_coding)
+
+ if (nargin < 11 || isempty (dummy_coding))
+ dummy_coding = "reference";
+ endif
+
+ n = rows (raw_X);
+ p = columns (raw_X); ## number of predictor VARIABLES
+ n_vars = p + 1; ## predictors + response
+
+ ## ── Step 1: Resolve shorthand → Terms matrix ────────────────────────────
+ if (isnumeric (modelspec) && ismatrix (modelspec) && ndims (modelspec) == 2)
+ ## Numeric terms matrix passed directly (e.g., from stepwiselm)
+ terms_pred = modelspec;
+ ## If width = p+1 (includes response column), strip last column
+ if (columns (terms_pred) == n_vars)
+ terms_pred = terms_pred(:, 1:p);
+ endif
+ ## Detect intercept from the terms matrix
+ has_intercept = any (all (terms_pred == 0, 2));
+ else
+ is_formula = ! isempty (strfind (modelspec, "~"));
+
+ if (is_formula)
+ ## Formula string — parse response name and RHS
+ tilde = strfind (modelspec, "~");
+ rhs = strtrim (modelspec(tilde(1)+1:end));
+ ## Check for unsupported C() notation
+ if (! isempty (regexp (rhs, 'C\s*\(')))
+ error ("Unable to understand the character vector or string scalar '%s'.", modelspec);
+ endif
+ ## Check for intercept suppression
+ if (! isempty (regexp (rhs, '(^|\s|[+])\s*-\s*1\b')))
+ has_intercept = false;
+ rhs = regexprep (rhs, '\s*[-]\s*1\b', '');
+ rhs = strtrim (rhs);
+ endif
+ ## Parse the RHS terms
+ terms_pred = __parse_rhs_terms__ (rhs, pred_names, has_intercept);
+ else
+ ## Shorthand string
+ terms_pred = __shorthand_to_terms__ (lower (modelspec), p, has_intercept);
+ endif
+ endif
+
+
+ ## terms_pred: (n_terms x p) — predictor columns only
+ ## Expand to include response column (always 0)
+ terms_mat = [terms_pred, zeros(rows(terms_pred), 1)];
+
+ ## ── Step 2: Build design matrix column by column ─────────────────────────
+ ##
+ ## We iterate over term rows. For each term:
+ ## - intercept row (all 0): add ones column
+## - numeric term with exponent e: add X.^e column
+ ## - categorical main effect: add k-1 dummy columns
+ ## - interaction: element-wise product of factor block columns
+
+ ## Pre-compute per-predictor "base columns" (for numerics) and
+ ## "dummy block" (for categoricals)
+ base_cols = cell (1, p); ## each cell = n x something
+ base_cnames = cell (1, p); ## cell of cell of char
+ cat_levels = cell (1, p); ## sorted unique levels for each cat predictor
+
+ for k = 1:p
+ if (cat_flags(k))
+ ## Categorical predictor — build dummy columns (reference = first level)
+ col_raw = raw_cols{k};
+ if (isa (col_raw, "categorical"))
+ levs = cellstr (categories (col_raw));
+ [~, cat_idx] = ismember (cellstr (col_raw), levs);
+ elseif (iscellstr (col_raw) || isstring (col_raw))
+ if (isstring (col_raw))
+ col_raw = cellstr (col_raw);
+ endif
+ levs = unique (col_raw);
+ levs = sort (levs);
+ [~, ~, cat_idx] = unique (col_raw);
+ ## Re-sort so idx matches sorted levs
+ [levs2, ~, cat_idx] = unique (col_raw);
+ levs = levs2;
+ else
+ ## Numeric forced categorical
+ vals = raw_X(:, k);
+ levs_num = unique (vals(! isnan (vals)));
+ levs_num = sort (levs_num);
+ levs = arrayfun (@num2str, levs_num, "UniformOutput", false);
+ cat_idx = zeros (n, 1);
+ for li = 1:numel (levs_num)
+ cat_idx(vals == levs_num(li)) = li;
+ endfor
+ endif
+ cat_levels{k} = levs;
+ n_lev = numel (levs);
+ ## Dummy coding: 'reference' drops first level, 'full' keeps all
+ if (strcmp (dummy_coding, "full"))
+ n_dum = n_lev;
+ D = zeros (n, n_dum);
+ dum_names = cell (1, n_dum);
+ for L = 1:n_lev
+ D(:, L) = double (cat_idx == L);
+ if (isnumeric (levs))
+ lev_str = num2str (levs(L));
+ else
+ lev_str = levs{L};
+ endif
+ dum_names{L} = sprintf ("%s_%s", pred_names{k}, lev_str);
+ endfor
+ else
+ ## Reference coding: drop first level
+ n_dum = n_lev - 1;
+ D = zeros (n, n_dum);
+ dum_names = cell (1, n_dum);
+ for L = 2:n_lev
+ D(:, L-1) = double (cat_idx == L);
+ if (isnumeric (levs))
+ lev_str = num2str (levs(L));
+ else
+ lev_str = levs{L};
+ endif
+ dum_names{L-1} = sprintf ("%s_%s", pred_names{k}, lev_str);
+ endfor
+ endif
+ base_cols{k} = D;
+ base_cnames{k} = dum_names;
+ else
+ ## Numeric predictor — single column; power applied per-term
+ base_cols{k} = raw_X(:, k);
+ base_cnames{k} = {pred_names{k}};
+ endif
+ endfor
+
+ ## Now build X_full column by column from terms_pred
+ X_full = [];
+ coef_names = {};
+
+ for i = 1:rows (terms_pred)
+ row = terms_pred(i, :);
+
+ if (all (row == 0))
+ ## Intercept
+ X_full = [X_full, ones(n, 1)];
+ coef_names{end+1} = "(Intercept)";
+ continue;
+ endif
+
+ ## Find active predictors in this term
+ active = find (row);
+ ## Start with a block of ones × empty name
+ blk = ones (n, 1);
+ blk_n = {""};
+
+ for ai = 1:numel (active)
+ v = active(ai);
+ pwr = row(v);
+
+ if (cat_flags(v))
+ ## Categorical: Cartesian product with dummy block
+ D = base_cols{v}; ## n x (n_lev-1)
+ dn = base_cnames{v}; ## 1 x (n_lev-1) cell
+ new_blk = [];
+ new_blk_n = {};
+ for c1 = 1:columns (blk)
+ for c2 = 1:columns (D)
+ new_blk = [new_blk, blk(:, c1) .* D(:, c2)];
+ n1 = blk_n{c1};
+ n2 = dn{c2};
+ if (isempty (n1))
+ new_blk_n{end+1} = n2;
+ else
+ new_blk_n{end+1} = [n1, ":", n2];
+ endif
+ endfor
+ endfor
+ blk = new_blk;
+ blk_n = new_blk_n;
+ else
+ ## Numeric: multiply by column^power
+ col = base_cols{v};
+ if (pwr > 1)
+ col = col .^ pwr;
+ cname = sprintf ("%s^%d", pred_names{v}, pwr);
+ else
+ cname = pred_names{v};
+ endif
+ blk = bsxfun (@times, blk, col);
+ for c1 = 1:numel (blk_n)
+ if (isempty (blk_n{c1}))
+ blk_n{c1} = cname;
+ else
+ blk_n{c1} = [blk_n{c1}, ":", cname];
+ endif
+ endfor
+ endif
+ endfor
+
+ X_full = [X_full, blk];
+ coef_names = [coef_names, blk_n];
+ endfor
+
+ ## ── Step 3: Compute incl_mask ───────────────────────────────────────────
+ ## Missing: any NaN in the NUMERIC data (raw_X numeric cols or raw_y)
+ nan_in_X = any (isnan (raw_X), 2);
+ nan_in_y = isnan (raw_y);
+ ## For categorical columns stored as codes, NaN means missing
+ missing_mask = nan_in_X | nan_in_y;
+
+ incl_mask = (! excl_mask) & (! missing_mask);
+ y_full = raw_y;
+
+ p_tot = columns (X_full);
+
+ ## ── Step 4: Build LinearPredictor string ────────────────────────────────
+ lp_str = __build_lp_str__ (terms_pred, pred_names, has_intercept);
+
+endfunction
+
+
+## ── Shorthand → terms_pred (n_terms x p) ────────────────────────────────────
+
+function terms = __shorthand_to_terms__ (spec, p, has_intercept)
+ ## Returns (n_terms x p) matrix (no response column yet)
+ switch (spec)
+ case "constant"
+ if (has_intercept)
+ terms = zeros (1, p);
+ else
+ terms = zeros (0, p);
+ endif
+
+ case "linear"
+ if (has_intercept)
+ terms = [zeros(1, p); eye(p)];
+ else
+ terms = eye (p);
+ endif
+
+ case "interactions"
+ lin_terms = eye (p);
+ inter_rows = [];
+ for i = 1:p-1
+ for j = i+1:p
+ r = zeros (1, p);
+ r(i) = 1; r(j) = 1;
+ inter_rows = [inter_rows; r];
+ endfor
+ endfor
+ if (has_intercept)
+ terms = [zeros(1,p); lin_terms; inter_rows];
+ else
+ terms = [lin_terms; inter_rows];
+ endif
+
+ case "purequadratic"
+ lin_terms = eye (p);
+ quad_rows = 2 * eye (p);
+ if (has_intercept)
+ terms = [zeros(1,p); lin_terms; quad_rows];
+ else
+ terms = [lin_terms; quad_rows];
+ endif
+
+ case {"quadratic", "full"}
+ lin_terms = eye (p);
+ inter_rows = [];
+ for i = 1:p-1
+ for j = i+1:p
+ r = zeros (1, p);
+ r(i) = 1; r(j) = 1;
+ inter_rows = [inter_rows; r];
+ endfor
+ endfor
+ quad_rows = 2 * eye (p);
+ if (has_intercept)
+ terms = [zeros(1,p); lin_terms; inter_rows; quad_rows];
+ else
+ terms = [lin_terms; inter_rows; quad_rows];
+ endif
+
+ otherwise
+ ## Check for polyNM pattern (e.g., 'poly22', 'poly33')
+ m = regexp (spec, '^poly(\d+)$', 'tokens');
+ if (! isempty (m))
+ digits_str = m{1}{1};
+ if (numel (digits_str) != p)
+ error ("fitlm: 'poly%s' requires %d digits for %d predictors.", ...
+ digits_str, p, p);
+ endif
+ max_powers = arrayfun (@(c) str2double (c), digits_str);
+ terms = __poly_to_terms__ (max_powers, p, has_intercept);
+ else
+ ## Unknown spec — fall back to linear (NOTE(review): consider erroring instead)
+ if (has_intercept)
+ terms = [zeros(1, p); eye(p)];
+ else
+ terms = eye (p);
+ endif
+ endif
+ endswitch
+endfunction
+
+
+## ── Formula RHS → terms_pred ─────────────────────────────────────────────────
+
+function terms = __parse_rhs_terms__ (rhs, pred_names, has_intercept)
+  ## Parse the right-hand side of a Wilkinson formula (supports +, -, *, /,
+  ## :, ^, and (group)^N) into an (n_terms x p) exponent matrix over the
+  ## predictors named in PRED_NAMES.
+  np = numel (pred_names);
+
+  ## Break the RHS into added / subtracted top-level tokens.
+  [plus_toks, minus_toks, rhs_has_one, rhs_drops_one] = __split_additive__ (rhs);
+
+  ## Expand every added token into exponent rows.
+  kept = {};
+  for k = 1:numel (plus_toks)
+    exp_rows = __expand_token__ (plus_toks{k}, pred_names, np);
+    if (isempty (exp_rows))
+      error ("The model formula contains names not recognized as predictor or response names.");
+    endif
+    kept = [kept, exp_rows];
+  endfor
+
+  ## Drop every row produced by a subtracted token.
+  for k = 1:numel (minus_toks)
+    exp_rows = __expand_token__ (minus_toks{k}, pred_names, np);
+    for m = 1:numel (exp_rows)
+      kept = __remove_row__ (kept, exp_rows{m});
+    endfor
+  endfor
+
+  kept = __dedup_rows__ (kept);
+
+  ## Canonical MATLAB ordering: total degree, then max individual power,
+  ## then the index of the first participating variable.
+  if (numel (kept) > 1)
+    keys = zeros (numel (kept), 3);
+    for k = 1:numel (kept)
+      r = kept{k};
+      keys(k, 1) = sum (r);
+      keys(k, 2) = max (r);
+      first_nz = find (r > 0, 1);
+      if (! isempty (first_nz))
+        keys(k, 3) = first_nz;
+      endif
+    endfor
+    [~, perm] = sortrows (keys);
+    kept = kept(perm);
+  endif
+
+  ## Assemble the body of the terms matrix.
+  body = zeros (numel (kept), np);
+  for k = 1:numel (kept)
+    body(k, :) = kept{k};
+  endfor
+
+  ## Intercept handling: an explicit '-1' suppresses it; an explicit '+1'
+  ## (or the caller's default) adds the zero row up front.
+  if (rhs_drops_one)
+    has_intercept = false;
+  endif
+  if (has_intercept || rhs_has_one)
+    terms = [zeros(1, np); body];
+  else
+    terms = body;
+  endif
+endfunction
+
+
+## ── LinearPredictor string builder ───────────────────────────────────────────
+
+function s = __build_lp_str__ (terms_pred, pred_names, has_intercept)
+  ## Render an exponent matrix as a human-readable linear-predictor string,
+  ## e.g. "1 + x1 + x1:x2^2".  An all-zero row renders as the intercept "1".
+  ## (HAS_INTERCEPT is accepted for interface compatibility; the intercept
+  ## is detected from the zero row itself.)
+  pieces = cell (1, rows (terms_pred));
+  for t = 1:rows (terms_pred)
+    expnts = terms_pred(t, :);
+    if (! any (expnts))
+      pieces{t} = "1";
+      continue;
+    endif
+    vars = find (expnts);
+    frags = cell (1, numel (vars));
+    for k = 1:numel (vars)
+      if (expnts(vars(k)) == 1)
+        frags{k} = pred_names{vars(k)};
+      else
+        frags{k} = sprintf ("%s^%d", pred_names{vars(k)}, expnts(vars(k)));
+      endif
+    endfor
+    pieces{t} = strjoin (frags, ":");
+  endfor
+  s = strjoin (pieces, " + ");
+endfunction
+
+
+## ── Split RHS on + and - (respecting parentheses) ───────────────────────────
+
+function [add_toks, sub_toks, has_int, remove_int] = __split_additive__ (rhs)
+  ## Split a formula RHS on top-level '+' and '-' (parentheses respected).
+  ## Returns the added and subtracted tokens; a literal '1' token is not
+  ## returned but instead sets HAS_INT (added) or REMOVE_INT (subtracted).
+  add_toks = {};
+  sub_toks = {};
+  has_int = false;
+  remove_int = false;
+  rhs = strtrim (rhs);
+  if (isempty (rhs))
+    return;
+  endif
+  paren_depth = 0;
+  buf = '';
+  negated = false;
+  for k = 1:numel (rhs)
+    c = rhs(k);
+    if (paren_depth == 0 && (c == '+' || c == '-'))
+      ## Token boundary: classify what we have, then track the new sign.
+      [add_toks, sub_toks, has_int, remove_int] = ...
+        __classify_additive_tok__ (buf, negated, add_toks, sub_toks, ...
+                                   has_int, remove_int);
+      buf = '';
+      negated = (c == '-');
+    else
+      if (c == '(')
+        paren_depth++;
+      elseif (c == ')')
+        paren_depth--;
+      endif
+      buf = [buf, c];
+    endif
+  endfor
+  ## Flush the trailing token.
+  [add_toks, sub_toks, has_int, remove_int] = ...
+    __classify_additive_tok__ (buf, negated, add_toks, sub_toks, ...
+                               has_int, remove_int);
+endfunction
+
+function [add_toks, sub_toks, has_int, remove_int] = ...
+  __classify_additive_tok__ (tok, negated, add_toks, sub_toks, has_int, remove_int)
+  ## Classify one raw token: '1' toggles the intercept flags, anything
+  ## else is appended to the added or subtracted token list.
+  tok = strtrim (tok);
+  if (isempty (tok))
+    return;
+  endif
+  if (strcmp (tok, '1'))
+    if (negated)
+      remove_int = true;
+    else
+      has_int = true;
+    endif
+  elseif (negated)
+    sub_toks{end+1} = tok;
+  else
+    add_toks{end+1} = tok;
+  endif
+endfunction
+
+
+## ── Expand a single token into term rows ─────────────────────────────────────
+
+function rows = __expand_token__ (tok, pred_names, p)
+  ## Expand one multiplicative formula token into a cell list of exponent
+  ## rows (each 1 x p over the predictors).  Returns {} when the token
+  ## references a name not found in PRED_NAMES; callers treat an empty
+  ## result as an unrecognized-name error.
+  ##
+  ## Grammar handled, tried in this order:
+  ##   (group)^N  group power expansion
+  ##   (group)    parenthesized sub-expression (recursive +/- split)
+  ##   A*B        Wilkinson crossing: A + B + A:B
+  ##   A/B        Wilkinson nesting:  A + A:B
+  ##   var^N      power ladder var, var^2, ..., var^N
+  ##   A:B        plain interaction (factors may carry ^powers)
+  ##   var        single predictor
+  tok = strtrim (tok);
+  rows = {};
+
+  ## Case 1: (group)^N — e.g. "(a+b)^2"
+  m = regexp (tok, '^\((.+)\)\^(\d+)$', 'tokens');
+  if (! isempty (m))
+    inner_str = m{1}{1};
+    power = str2double (m{1}{2});
+    ## NOTE(review): the powered group is split on '+' only; '-' or nested
+    ## parentheses inside it are not handled here — confirm intended.
+    inner_toks = strtrim (strsplit (inner_str, '+'));
+    inner_vars = [];
+    for i = 1:numel (inner_toks)
+      idx = find (strcmp (pred_names, strtrim (inner_toks{i})));
+      if (! isempty (idx))
+        inner_vars(end+1) = idx;
+      endif
+    endfor
+    if (! isempty (inner_vars))
+      rows = __group_power_expand__ (inner_vars, power, p);
+    endif
+    return;
+  endif
+
+  ## Case 2: bare (group) — recursively split the inside on +/-
+  m = regexp (tok, '^\((.+)\)$', 'tokens');
+  if (! isempty (m))
+    inner_str = m{1}{1};
+    [add_t, sub_t, ~, ~] = __split_additive__ (inner_str);
+    for i = 1:numel (add_t)
+      tmp_exp = __expand_token__ (add_t{i}, pred_names, p);
+      rows = [rows, tmp_exp];
+    endfor
+    for i = 1:numel (sub_t)
+      sub_r = __expand_token__ (sub_t{i}, pred_names, p);
+      for j = 1:numel (sub_r)
+        rows = __remove_row__ (rows, sub_r{j});
+      endfor
+    endfor
+    rows = __dedup_rows__ (rows);
+    return;
+  endif
+
+  ## Case 3: crossing (*) — left-associative fold over top-level '*' parts
+  if (__has_at_toplevel__ (tok, '*'))
+    parts = __split_toplevel__ (tok, '*');
+    result = __expand_token__ (parts{1}, pred_names, p);
+    for i = 2:numel (parts)
+      right = __expand_token__ (parts{i}, pred_names, p);
+      result = __crossing_rows__ (result, right);
+    endfor
+    rows = result;
+    return;
+  endif
+
+  ## Case 4: nesting (/) — A/B yields A plus A:B (B never appears alone)
+  if (__has_at_toplevel__ (tok, '/'))
+    parts = __split_toplevel__ (tok, '/');
+    left = __expand_token__ (parts{1}, pred_names, p);
+    for i = 2:numel (parts)
+      right = __expand_token__ (parts{i}, pred_names, p);
+      cross = {};
+      for li = 1:numel (left)
+        for ri = 1:numel (right)
+          ## Exponent rows add when terms are multiplied.
+          cross{end+1} = left{li} + right{ri};
+        endfor
+      endfor
+      left = [left, cross];
+    endfor
+    rows = __dedup_rows__ (left);
+    return;
+  endif
+
+  ## Case 5: power ladder var^N — emits var, var^2, ..., var^N
+  m = regexp (tok, '^([a-zA-Z_][a-zA-Z0-9_]*)\^(\d+)$', 'tokens');
+  if (! isempty (m))
+    varname = m{1}{1};
+    power = str2double (m{1}{2});
+    idx = find (strcmp (pred_names, varname));
+    if (! isempty (idx))
+      for pwr = 1:power
+        row = zeros (1, p);
+        row(idx) = pwr;
+        rows{end+1} = row;
+      endfor
+      return;
+    endif
+    ## Unknown base name: fall through; Cases 6/7 will not match either,
+    ## so the function returns {} and the caller reports the bad name.
+  endif
+
+  ## Case 6: interaction (A:B) — each factor may be "name" or "name^power"
+  if (any (tok == ':'))
+    parts = strtrim (strsplit (tok, ':'));
+    row = zeros (1, p);
+    valid = true;
+    for i = 1:numel (parts)
+      part = parts{i};
+      caret = strfind (part, '^');
+      if (! isempty (caret))
+        base = strtrim (part(1:caret(1)-1));
+        pwr = str2double (part(caret(1)+1:end));
+      else
+        base = part;
+        pwr = 1;
+      endif
+      idx = find (strcmp (pred_names, base));
+      if (isempty (idx))
+        valid = false;
+        break;
+      endif
+      ## NOTE(review): a repeated factor (a:a) overwrites rather than sums
+      ## the exponent — confirm against MATLAB semantics.
+      row(idx(1)) = pwr;
+    endfor
+    if (valid)
+      rows = {row};
+    endif
+    return;
+  endif
+
+  ## Case 7: simple predictor name
+  idx = find (strcmp (pred_names, tok));
+  if (! isempty (idx))
+    row = zeros (1, p);
+    row(idx) = 1;
+    rows = {row};
+  endif
+endfunction
+
+
+## ── Group power expansion ────────────────────────────────────────────────────
+
+function rows = __group_power_expand__ (var_indices, max_power, p)
+  ## Expand (x_a + x_b + ...)^N into one exponent row per combination of
+  ## per-variable powers whose total degree lies in [1, MAX_POWER].
+  exps = __enum_exponents__ (numel (var_indices), max_power);
+  n_rows = size (exps, 1);
+  rows = cell (1, n_rows);
+  for k = 1:n_rows
+    r = zeros (1, p);
+    r(var_indices) = exps(k, :);
+    rows{k} = r;
+  endfor
+  if (isempty (rows))
+    rows = {};
+  endif
+endfunction
+
+
+## ── Enumerate exponent vectors ───────────────────────────────────────────────
+
+function E = __enum_exponents__ (nv, max_deg)
+  ## Enumerate all exponent vectors of length NV with entries in
+  ## 0..MAX_DEG whose sum is between 1 and MAX_DEG (inclusive).
+  ## Vectors are emitted in mixed-radix counting order (first variable
+  ## varies fastest), matching the callers' expectations.
+  ##
+  ## The output is preallocated and trimmed instead of grown row-by-row
+  ## inside the loop (the previous version did O(total^2) copying).
+  ## Note: enumeration cost is (max_deg+1)^nv, exponential in NV.
+  base = max_deg + 1;
+  total = base ^ nv;
+  E = zeros (total, nv);
+  n_kept = 0;
+  for i = 0:total-1
+    ## Decode i as NV base-(max_deg+1) digits.
+    vec = zeros (1, nv);
+    val = i;
+    for j = 1:nv
+      vec(j) = mod (val, base);
+      val = floor (val / base);
+    endfor
+    s = sum (vec);
+    if (s >= 1 && s <= max_deg)
+      n_kept++;
+      E(n_kept, :) = vec;
+    endif
+  endfor
+  E = E(1:n_kept, :);
+endfunction
+
+
+## ── Crossing rows (A*B expansion) ────────────────────────────────────────────
+
+function rows = __crossing_rows__ (left, right)
+  ## Wilkinson crossing A*B = A + B + A:B.  The interaction rows are the
+  ## elementwise sums of every left/right exponent-row pair.
+  prods = cell (1, numel (left) * numel (right));
+  k = 0;
+  for li = 1:numel (left)
+    for ri = 1:numel (right)
+      k++;
+      prods{k} = left{li} + right{ri};
+    endfor
+  endfor
+  rows = __dedup_rows__ ([left, right, prods]);
+endfunction
+
+
+## ── Remove first matching row ────────────────────────────────────────────────
+
+function rows = __remove_row__ (rows, target)
+  ## Delete the FIRST row in the cell list ROWS equal to TARGET (if any);
+  ## later duplicates are kept.
+  hit = find (cellfun (@(r) isequal (r, target), rows), 1);
+  if (! isempty (hit))
+    rows(hit) = [];
+  endif
+endfunction
+
+
+## ── Deduplicate rows ─────────────────────────────────────────────────────────
+
+function rows = __dedup_rows__ (rows)
+  ## Keep only the first occurrence of each distinct exponent row,
+  ## preserving the original order.
+  uniq = {};
+  for k = 1:numel (rows)
+    seen = false;
+    for m = 1:numel (uniq)
+      if (isequal (uniq{m}, rows{k}))
+        seen = true;
+        break;
+      endif
+    endfor
+    if (! seen)
+      uniq{end+1} = rows{k};
+    endif
+  endfor
+  rows = uniq;
+endfunction
+
+
+## ── Check for char at top level (outside parens) ─────────────────────────────
+
+function result = __has_at_toplevel__ (tok, ch)
+  ## True when character CH occurs in TOK outside of any parentheses.
+  if (isempty (tok))
+    result = false;
+    return;
+  endif
+  ## Depth BEFORE each character: running paren balance of the prefix.
+  deltas = (tok == '(') - (tok == ')');
+  depth_before = cumsum ([0, deltas(1:end-1)]);
+  result = any (tok == ch & depth_before == 0);
+endfunction
+
+
+## ── Split on char at top level ───────────────────────────────────────────────
+
+function parts = __split_toplevel__ (tok, ch)
+  ## Split TOK on character CH, but only at zero parenthesis depth.
+  ## Every piece is trimmed; interior pieces are kept even when empty,
+  ## while an empty trailing piece is dropped (matching the incremental
+  ## scanner this replaces).
+  parts = {};
+  if (isempty (tok))
+    return;
+  endif
+  deltas = (tok == '(') - (tok == ')');
+  depth_before = cumsum ([0, deltas(1:end-1)]);
+  cut = find (tok == ch & depth_before == 0);
+  starts = [1, cut + 1];
+  stops = [cut - 1, numel(tok)];
+  for k = 1:numel (starts)
+    raw = tok(starts(k):stops(k));
+    if (k < numel (starts) || ! isempty (raw))
+      parts{end+1} = strtrim (raw);
+    endif
+  endfor
+endfunction
+
+
+## ── Polynomial shorthand to terms ────────────────────────────────────────────
+
+function terms = __poly_to_terms__ (max_powers, p, has_intercept)
+  ## Expand a 'polyNM...' shorthand: MAX_POWERS(j) caps the exponent of
+  ## predictor j, and the total degree of every term is capped by
+  ## max(MAX_POWERS).  Terms are sorted by total degree, then
+  ## lexicographically by exponent row; the intercept row is prepended
+  ## when HAS_INTERCEPT.
+  ##
+  ## NOTE(review): for p == 1, ndgrid with a single axis may replicate the
+  ## grid (ndgrid(x) ~ ndgrid(x,x)), which would duplicate rows — confirm
+  ## against callers before relying on single-predictor poly specs.
+  deg_cap = max (max_powers);
+  axes = arrayfun (@(mp) 0:mp, max_powers, "UniformOutput", false);
+  grids = cell (1, p);
+  [grids{:}] = ndgrid (axes{:});
+  combos = zeros (numel (grids{1}), p);
+  for j = 1:p
+    combos(:, j) = grids{j}(:);
+  endfor
+  deg = sum (combos, 2);
+  body = combos(deg >= 1 & deg <= deg_cap, :);
+  [~, perm] = sortrows ([sum(body, 2), body]);
+  body = body(perm, :);
+  if (has_intercept)
+    terms = [zeros(1, p); body];
+  else
+    terms = body;
+  endif
+endfunction
diff --git a/inst/private/__fitlm_build_diagnostics__.m b/inst/private/__fitlm_build_diagnostics__.m
new file mode 100644
index 00000000..dae490ff
--- /dev/null
+++ b/inst/private/__fitlm_build_diagnostics__.m
@@ -0,0 +1,182 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <https://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {[@var{Fitted}, @var{Residuals}, @var{Diagnostics}, @
+## @var{MFVNM}] =} __fitlm_build_diagnostics__ (@var{s}, @var{X_full}, @
+## @var{y_full}, @var{incl_mask}, @var{excl_mask})
+##
+## Expand lm_fit_engine output from n_used rows to full n rows.
+##
+## Key behaviors confirmed from MATLAB Phase 4 cross-validation:
+## - Excluded rows: Fitted computed (X*beta), Residuals = NaN
+## - Missing rows (NaN in X or y): Fitted = NaN, Residuals = NaN
+## - Diagnostics: Leverage=0, CooksD/Dffits/S2_i/CovRatio=NaN for non-included rows
+##
+## @var{s} is the struct returned by @code{lm_fit_engine}.
+## @var{X_full} is the full n x p_tot design matrix.
+## @var{y_full} is the full n x 1 response (NaN for missing rows).
+## @var{incl_mask} is n x 1 logical: rows used in fit.
+## @var{excl_mask} is n x 1 logical: rows explicitly excluded.
+## @end deftypefn
+
+function [Fitted, Residuals, Diagnostics, MFVNM] = ...
+  __fitlm_build_diagnostics__ (s, X_full, y_full, incl_mask, excl_mask)
+  ## Expand lm_fit_engine output from n_used rows to the full n rows.
+  ##
+  ## Assumes S provides the engine fields read below: beta, yhat, resid,
+  ## h (leverages), Q (thin-QR factor, n_used x p_est), vcov, p_est, mse,
+  ## dfe.  (TODO confirm field list against lm_fit_engine.)
+  ##
+  ## Row conventions (confirmed from MATLAB Phase 4 cross-validation):
+  ##   - Excluded rows: Fitted computed as X*beta, Residuals = NaN
+  ##   - Missing rows (NaN in X or y): Fitted = NaN, Residuals = NaN
+  ##   - Diagnostics: Leverage = 0; CooksD/Dffits/S2_i/CovRatio = NaN for
+  ##     non-included rows
+
+  n = rows (X_full);
+  p_est = s.p_est;
+  mse = s.mse;
+  dfe = s.dfe;
+
+  ## ── Fitted ──────────────────────────────────────────────────────────────
+  Fitted = NaN (n, 1);
+  Fitted(incl_mask) = s.yhat;
+
+  ## Excluded-but-not-missing rows still get a prediction from the fitted
+  ## coefficients; rows with missing data keep NaN.
+  missing_mask = ! incl_mask & ! excl_mask; ## true = missing (NaN in data)
+  excl_not_missing = excl_mask & ! missing_mask;
+  if (any (excl_not_missing))
+    Fitted(excl_not_missing) = X_full(excl_not_missing, :) * s.beta;
+  endif
+
+  ## ── Residuals (struct of four n x 1 columns) ────────────────────────────
+  ## Pearson      = raw / sqrt(MSE)
+  ## Standardized = raw / sqrt(MSE * (1 - h))
+  ## Studentized  = raw / sqrt(S2_i * (1 - h))   (externally studentized)
+  ##   where S2_i = (dfe * MSE - raw^2 / (1-h)) / (dfe - 1)
+  raw_r = NaN (n, 1);
+  pear_r = NaN (n, 1);
+  stud_r = NaN (n, 1);
+  std_r = NaN (n, 1);
+
+  h_inc = s.h;       ## leverages of included rows, n_used x 1
+  raw_inc = s.resid; ## raw residuals of included rows
+
+  raw_r(incl_mask) = raw_inc;
+  pear_r(incl_mask) = raw_inc / sqrt (mse);
+  std_r(incl_mask) = raw_inc ./ sqrt (mse * (1 - h_inc));
+
+  ## Leave-one-out variance, clamped at zero against round-off.
+  s2_i_inc = max (0, (dfe * mse - raw_inc .^ 2 ./ (1 - h_inc)) / max (1, dfe - 1));
+  stud_denom = sqrt (s2_i_inc .* (1 - h_inc));
+  stud_r_inc = raw_inc ./ stud_denom;
+  stud_r_inc(stud_denom == 0) = 0;
+  stud_r(incl_mask) = stud_r_inc;
+
+  Residuals.Raw = raw_r;
+  Residuals.Pearson = pear_r;
+  Residuals.Studentized = stud_r;
+  Residuals.Standardized = std_r;
+
+  ## ── Diagnostics ─────────────────────────────────────────────────────────
+  lev = zeros (n, 1);
+  cooks = NaN (n, 1);
+  dffits_v = NaN (n, 1);
+  s2_i_v = NaN (n, 1);
+  covratio_v = NaN (n, 1);
+  dfbetas_v = zeros (n, p_est);
+
+  lev(incl_mask) = h_inc;
+
+  ## Hat matrix: H = Q*Q' for the included rows, placed as a block into the
+  ## full n x n matrix (block assignment replaces the former element-wise
+  ## double loop).
+  Q_inc = s.Q; ## n_used x p_est
+  inc_idx = find (incl_mask);
+  hatmat = zeros (n, n);
+  hatmat(inc_idx, inc_idx) = Q_inc * Q_inc';
+
+  ## Cook's distance: D_i = (raw_i^2 * h_i) / (p_est * MSE * (1-h_i)^2)
+  cooks(incl_mask) = (raw_inc .^ 2 .* h_inc) ./ (p_est * mse * (1 - h_inc) .^ 2);
+
+  ## DFFITS: raw / sqrt(S2_i * h), with h clamped away from zero and the
+  ## zero-leverage case pinned to 0.
+  h_inc_safe = max (h_inc, eps);
+  dffits_inc = raw_inc ./ sqrt (s2_i_inc .* h_inc_safe);
+  dffits_inc(h_inc == 0) = 0;
+  dffits_v(incl_mask) = dffits_inc;
+
+  ## S2_i (leave-one-out variance)
+  s2_i_v(incl_mask) = s2_i_inc;
+
+  ## CovRatio via the simplified form (S2_i / MSE)^p_est / (1 - h).
+  s2_ratio = s2_i_inc / max (mse, eps);
+  covrat_inc = (s2_ratio .^ p_est) ./ (1 - h_inc);
+  covrat_inc(h_inc >= 1) = NaN;
+  covratio_v(incl_mask) = covrat_inc;
+
+  ## DFBETAS approximation via the QR factor:
+  ##   dfbetas_i = (resid_i / (1-h_i)) * Q(i,:)' / sqrt(S2_i * diag(vcov))
+  ## The engine's vcov may be p_tot x p_tot; keep only the estimated part.
+  vcov_diag = diag (s.vcov);
+  if (numel (vcov_diag) > p_est)
+    vcov_diag = vcov_diag(1:p_est);
+  endif
+
+  for ii = 1:numel (inc_idx)
+    dfb = (raw_inc(ii) ./ (1 - h_inc(ii))) * Q_inc(ii, :)' ...
+          ./ sqrt (s2_i_inc(ii) .* max (vcov_diag, eps));
+    dfbetas_v(inc_idx(ii), 1:p_est) = dfb';
+  endfor
+
+  Diagnostics.Leverage = lev;
+  Diagnostics.CooksDistance = cooks;
+  Diagnostics.Dffits = dffits_v;
+  Diagnostics.S2_i = s2_i_v;
+  Diagnostics.CovRatio = covratio_v;
+  Diagnostics.Dfbetas = dfbetas_v;
+  Diagnostics.HatMatrix = hatmat;
+
+  ## ── ModelFitVsNullModel ─────────────────────────────────────────────────
+  ## The F-vs-null statistic requires the intercept flag, which this helper
+  ## does not receive (no intercept => all NaN, confirmed Block 10-P4).
+  ## The caller overrides MFVNM when appropriate.
+  MFVNM.Fstat = NaN;
+  MFVNM.Pvalue = NaN;
+  MFVNM.NullModel = NaN;
+
+endfunction
diff --git a/inst/private/__fitlm_build_formula__.m b/inst/private/__fitlm_build_formula__.m
new file mode 100644
index 00000000..b45a6d70
--- /dev/null
+++ b/inst/private/__fitlm_build_formula__.m
@@ -0,0 +1,87 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <https://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {@var{Formula} =} __fitlm_build_formula__ (@
+## @var{terms_mat}, @var{var_names}, @var{pred_names}, @var{resp_name}, @
+## @var{has_intercept}, @var{lp_str}, @var{coef_names})
+##
+## Build the Formula struct for a LinearModel.
+##
+## @var{terms_mat} is the (n_terms x n_all_vars) binary matrix including
+## the response column (always 0).
+##
+## Returns a struct with the 14 confirmed fields from MATLAB Block 1-P4.
+## @end deftypefn
+
+function Formula = __fitlm_build_formula__ (terms_mat, var_names, pred_names, ...
+                                            resp_name, has_intercept, lp_str, coef_names)
+  ## Assemble the Formula struct of a LinearModel from the terms matrix
+  ## (exponent rows over [predictors..., response]; the response column is
+  ## always zero).  COEF_NAMES is accepted for interface compatibility.
+  ## The 14 fields match MATLAB Block 1-P4.
+
+  n_terms = rows (terms_mat);
+  n_vars = columns (terms_mat); ## includes response column
+  n_pred = numel (pred_names);
+  terms_pred = terms_mat(:, 1:n_pred);
+
+  ## Human-readable term names: "(Intercept)" for the zero row, otherwise
+  ## colon-joined "name" / "name^power" factors.
+  term_names = cell (n_terms, 1);
+  for t = 1:n_terms
+    expnts = terms_pred(t, :);
+    if (! any (expnts))
+      term_names{t} = "(Intercept)";
+      continue;
+    endif
+    vars = find (expnts);
+    frags = cell (1, numel (vars));
+    for k = 1:numel (vars)
+      if (expnts(vars(k)) == 1)
+        frags{k} = pred_names{vars(k)};
+      else
+        frags{k} = sprintf ("%s^%d", pred_names{vars(k)}, expnts(vars(k)));
+      endif
+    endfor
+    term_names{t} = strjoin (frags, ":");
+  endfor
+
+  ## A predictor is "in the model" when any term uses it; the trailing
+  ## response column stays false.
+  in_model = false (1, n_vars);
+  in_model(1:n_pred) = any (terms_pred != 0, 1);
+
+  Formula.LinearPredictor = lp_str;
+  Formula.ResponseName = resp_name;
+  Formula.Terms = terms_mat;
+  Formula.VariableNames = var_names; ## e.g. {'x1','x2','x3','y'}
+  Formula.HasIntercept = logical (has_intercept);
+  Formula.InModel = in_model;
+  Formula.TermNames = term_names;
+  Formula.PredictorNames = pred_names;
+  Formula.NTerms = n_terms;
+  Formula.NVars = n_vars;
+  Formula.NPredictors = n_pred;
+  Formula.Link = "identity";
+  Formula.ModelFun = @(b, X) X * b;
+  Formula.FunctionCalls = {};
+
+endfunction
diff --git a/inst/private/__fitlm_build_obsinfo__.m b/inst/private/__fitlm_build_obsinfo__.m
new file mode 100644
index 00000000..622350ed
--- /dev/null
+++ b/inst/private/__fitlm_build_obsinfo__.m
@@ -0,0 +1,46 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <https://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {@var{ObservationInfo} =} __fitlm_build_obsinfo__ (@
+## @var{weights}, @var{excl_mask}, @var{raw_X}, @var{raw_y}, @var{obs_names})
+##
+## Build the ObservationInfo table for a LinearModel.
+##
+## Confirmed column names from MATLAB Block 7-P4:
+## {Weights, Excluded, Missing, Subset}
+## Subset = true means the row was used in the fit (not excluded, not missing).
+## @end deftypefn
+
+function ObservationInfo = __fitlm_build_obsinfo__ (weights, excl_mask, raw_X, raw_y, obs_names)
+  ## Build the ObservationInfo struct for a LinearModel.
+  ##
+  ## Fields (confirmed column names from MATLAB Block 7-P4):
+  ##   Weights, Excluded, Missing, Subset.
+  ## Subset is true for rows actually used in the fit: neither explicitly
+  ## excluded nor containing missing (NaN) data.
+  ##
+  ## OBS_NAMES is accepted for interface compatibility; row names are
+  ## attached by the caller.  (The unused local row count was dropped.)
+
+  ## A row is missing when X has any NaN in it or y is NaN.
+  missing = any (isnan (raw_X), 2) | isnan (raw_y);
+  excluded = logical (excl_mask(:));
+
+  ObservationInfo.Weights = weights(:);
+  ObservationInfo.Excluded = excluded;
+  ObservationInfo.Missing = missing;
+  ObservationInfo.Subset = (! excluded) & (! missing);
+
+endfunction
diff --git a/inst/private/__fitlm_build_varinfo__.m b/inst/private/__fitlm_build_varinfo__.m
new file mode 100644
index 00000000..d802a6f7
--- /dev/null
+++ b/inst/private/__fitlm_build_varinfo__.m
@@ -0,0 +1,101 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <https://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {@var{VariableInfo} =} __fitlm_build_varinfo__ (@
+## @var{var_names}, @var{cat_flags}, @var{raw_cols}, @var{in_model})
+##
+## Build the VariableInfo struct for a LinearModel.
+##
+## Confirmed from MATLAB Block 2-P4:
+## VariableInfo is a STRUCT (mirroring table layout) with fields:
+## Class - cell of char: 'double', 'categorical', etc.
+## Range - cell of cell: {[min max]} for numeric, {[sorted_uniq]} for categorical
+## InModel - logical column vector
+## IsCategorical - logical column vector
+##
+## Row names are stored separately (= var_names).
+##
+## @var{in_model} is a logical row vector of length numel(var_names): true for
+## each predictor that appears in the model (false for response).
+## @end deftypefn
+
+function VariableInfo = __fitlm_build_varinfo__ (var_names, cat_flags, raw_cols, in_model)
+  ## Build the VariableInfo struct (table-like, per MATLAB Block 2-P4):
+  ##   Class         - storage class name per variable ('double', 'cell', ...)
+  ##   Range         - {[min max]} for numeric, {levels} for categorical
+  ##   InModel       - logical column: variable participates in the model
+  ##   IsCategorical - logical column, from CAT_FLAGS
+  ## Row names (= VAR_NAMES) are stored separately by the caller.
+
+  nv = numel (var_names);
+  Class = cell (nv, 1);
+  Range = cell (nv, 1);
+  InModel = logical (in_model(:));
+  IsCategorical = logical (cat_flags(:));
+
+  for j = 1:nv
+    col = raw_cols{j};
+
+    ## Storage class
+    if (isa (col, "categorical"))
+      Class{j} = "categorical";
+    elseif (iscellstr (col) || isstring (col))
+      Class{j} = "cell";
+    elseif (isnumeric (col))
+      Class{j} = "double";
+    else
+      Class{j} = class (col);
+    endif
+
+    ## Range
+    if (IsCategorical(j))
+      ## Categorical variables report their sorted unique levels.
+      if (isa (col, "categorical"))
+        lvls = cellstr (categories (col));
+      elseif (iscellstr (col))
+        lvls = unique (col);
+      elseif (isstring (col))
+        lvls = unique (cellstr (col));
+      else
+        ## Numeric forced categorical: row vector of unique non-NaN values.
+        vals = unique (col(! isnan (col)));
+        lvls = vals(:)';
+      endif
+      Range{j} = {lvls};
+    elseif (isnumeric (col))
+      ## Numeric variables report [min max] over the non-NaN entries.
+      finite_vals = col(! isnan (col));
+      if (isempty (finite_vals))
+        Range{j} = {[NaN NaN]};
+      else
+        Range{j} = {[min(finite_vals), max(finite_vals)]};
+      endif
+    else
+      Range{j} = {[]};
+    endif
+  endfor
+
+  VariableInfo.Class = Class;
+  VariableInfo.Range = Range;
+  VariableInfo.InModel = InModel;
+  VariableInfo.IsCategorical = IsCategorical;
+
+endfunction
diff --git a/inst/private/__fitlm_parse_inputs__.m b/inst/private/__fitlm_parse_inputs__.m
new file mode 100644
index 00000000..b7974e9b
--- /dev/null
+++ b/inst/private/__fitlm_parse_inputs__.m
@@ -0,0 +1,347 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <https://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {[@var{raw_X}, @var{raw_y}, @var{var_names}, @
+## @var{cat_flags}, @var{weights}, @var{excl_mask}, @var{modelspec}, @
+## @var{obs_names}, @var{resp_name}, @var{pred_names}, @var{has_intercept}] =} @
+## __fitlm_parse_inputs__ (@var{varargin})
+##
+## Parse fitlm inputs into clean typed outputs.
+##
+## Returns:
+## raw_X - n x p numeric matrix (NaN in missing rows preserved)
+## raw_y - n x 1 numeric response vector
+## var_names - 1 x (p+1) cell of char: {pred_names..., resp_name}
+## cat_flags - 1 x (p+1) logical: IsCategorical for each variable
+## weights - n x 1 double (1.0 if unweighted)
+## excl_mask - n x 1 logical (true = explicitly excluded)
+## modelspec - char: Wilkinson formula string OR shorthand string
+## obs_names - cell of char (row names or {})
+## resp_name - char: response variable name
+## pred_names - 1 x p cell of char: predictor variable names
+## has_intercept - logical: true unless explicitly suppressed
+## raw_cols - cell of original columns (for VariableInfo building)
+## @end deftypefn
+
+function [raw_X, raw_y, var_names, cat_flags, weights, excl_mask, ...
+          modelspec, obs_names, resp_name, pred_names, has_intercept, ...
+          raw_cols, dummy_coding] = ...
+    __fitlm_parse_inputs__ (varargin)
+  ## Normalize fitlm's two calling forms (table input, or X/y matrix input)
+  ## into the typed outputs documented in the texinfo block above.
+  ## Name-value parsing is delegated to __parse_nv__, and CategoricalVars
+  ## resolution to __resolve_cat_vars__ (both defined later in this file).
+
+  if (numel (varargin) < 1)
+    error ("fitlm: at least one input argument required.");
+  endif
+
+  ## --- Detect first argument: table or matrix ---
+  is_table_input = isa (varargin{1}, "table");
+
+  ## Initialize name-value defaults
+  weights_arg = [];
+  excl_arg = [];
+  intercept_arg = true;
+  cat_vars_arg = []; ## integer indices or {}
+  var_names_arg = {}; ## VarNames override
+  resp_var_arg = ""; ## ResponseVar override
+  pred_vars_arg = {}; ## PredictorVars override
+
+  if (is_table_input)
+    ## ── TABLE INPUT ──────────────────────────────────────────────────────────
+    tbl = varargin{1};
+    varargin(1) = [];
+
+    ## Extract table metadata
+    all_col_names = tbl.Properties.VariableNames;
+    n_vars_tbl = numel (all_col_names);
+
+    ## ObservationNames from row names
+    obs_names = tbl.Properties.RowNames; ## cell or {} if unset
+
+    ## Parse optional modelspec (2nd positional arg if char/not a NV key,
+    ## or a numeric terms matrix)
+    modelspec = "linear"; ## default shorthand
+    if (! isempty (varargin) && isnumeric (varargin{1}) && ...
+        ismatrix (varargin{1}) && ndims (varargin{1}) == 2)
+      ## Numeric terms matrix passed directly
+      modelspec = varargin{1};
+      varargin(1) = [];
+    elseif (! isempty (varargin) && ischar (varargin{1}) && ...
+            ! __is_nv_key__ (varargin{1}))
+      modelspec = varargin{1};
+      varargin(1) = [];
+    endif
+
+    ## Parse name-value pairs
+    [weights_arg, excl_arg, intercept_arg, cat_vars_arg, var_names_arg, ...
+     resp_var_arg, pred_vars_arg, dummy_coding_arg] = __parse_nv__ (varargin{:});
+
+    ## Determine response and predictor variables.  Precedence:
+    ## explicit ResponseVar > formula LHS > last table column.
+    if (! isempty (resp_var_arg))
+      resp_name = resp_var_arg;
+    elseif (! isempty (strfind (modelspec, "~")))
+      ## Formula string — response is LHS
+      tilde_pos = strfind (modelspec, "~");
+      resp_name = strtrim (modelspec(1:tilde_pos(1)-1));
+    else
+      ## Default: last column of table
+      resp_name = all_col_names{end};
+    endif
+
+    ## Predictor names: explicit PredictorVars (names or indices) beats
+    ## the default of "every column except the response".
+    if (! isempty (pred_vars_arg))
+      if (iscell (pred_vars_arg))
+        pred_names = pred_vars_arg;
+      else
+        pred_names = all_col_names(pred_vars_arg);
+      endif
+    else
+      ## All columns except response
+      pred_names = all_col_names(! strcmp (all_col_names, resp_name));
+    endif
+
+    ## var_names: predictor order then response (matches MATLAB Block 4-P4)
+    var_names = [pred_names, {resp_name}];
+
+    ## Extract raw columns
+    n = rows (tbl);
+    p = numel (pred_names);
+    raw_X = zeros (n, p);
+    raw_cols = cell (1, p + 1);
+
+    cat_flags = false (1, p + 1);
+
+    ## Handle CategoricalVars — may be names or indices into pred_names
+    cat_idx_set = __resolve_cat_vars__ (cat_vars_arg, pred_names);
+
+    for k = 1:p
+      col = tbl.(pred_names{k});
+      raw_cols{k} = col;
+      if (isa (col, "categorical"))
+        ## Store numeric codes — raw_X stays as codes for design builder
+        cat_flags(k) = true;
+        raw_X(:, k) = double (col); ## integer codes (1-based after sort)
+      elseif (iscellstr (col) || isstring (col))
+        cat_flags(k) = true;
+        raw_X(:, k) = NaN; ## placeholder; design builder uses raw_cols
+      elseif (ismember (k, cat_idx_set))
+        ## Numeric column the user forced categorical: keep numeric values.
+        cat_flags(k) = true;
+        raw_X(:, k) = col;
+      else
+        raw_X(:, k) = col;
+      endif
+    endfor
+
+    ## Response column
+    raw_y = tbl.(resp_name);
+    raw_y = raw_y(:);
+    raw_cols{p+1} = raw_y;
+
+    ## cat_flags for response (always false)
+    cat_flags(p+1) = false;
+
+    ## No VarNames override for table input
+
+  else
+    ## ── MATRIX INPUT ─────────────────────────────────────────────────────────
+    X_in = varargin{1};
+    ## NOTE(review): nargin and numel (varargin) are equivalent in a
+    ## varargin-only signature, so this check is duplicated — harmless.
+    if (nargin < 2 || numel (varargin) < 2)
+      error ("fitlm: matrix input requires at least X and y arguments.");
+    endif
+    y_in = varargin{2}(:);
+    varargin(1:2) = [];
+
+    n = rows (X_in);
+    p = columns (X_in);
+    obs_names = {};
+
+    ## Parse optional modelspec (char string, shorthand, or numeric terms matrix)
+    modelspec = "linear";
+    if (! isempty (varargin) && isnumeric (varargin{1}) && ...
+        ismatrix (varargin{1}) && ndims (varargin{1}) == 2)
+      ## Numeric terms matrix passed directly
+      modelspec = varargin{1};
+      varargin(1) = [];
+    elseif (! isempty (varargin) && ischar (varargin{1}) && ...
+            ! __is_nv_key__ (varargin{1}))
+      modelspec = varargin{1};
+      varargin(1) = [];
+    endif
+
+    ## Parse name-value pairs
+    [weights_arg, excl_arg, intercept_arg, cat_vars_arg, var_names_arg, ...
+     resp_var_arg, pred_vars_arg, dummy_coding_arg] = __parse_nv__ (varargin{:});
+
+    ## Variable names
+    if (! isempty (var_names_arg))
+      ## VarNames override: last = response, rest = predictors
+      if (numel (var_names_arg) != p + 1)
+        error ("fitlm: VarNames must have %d elements (p predictors + 1 response).", p+1);
+      endif
+      pred_names = var_names_arg(1:p);
+      resp_name = var_names_arg{p+1};
+    else
+      ## Auto-generate: x1...xp and y
+      pred_names = arrayfun (@(j) sprintf ("x%d", j), 1:p, "UniformOutput", false);
+      resp_name = "y";
+    endif
+
+    var_names = [pred_names, {resp_name}];
+
+    raw_X = X_in;
+    raw_y = y_in;
+
+    ## cat_flags — start all false, then mark CategoricalVars
+    cat_flags = false (1, p + 1);
+    cat_idx_set = __resolve_cat_vars__ (cat_vars_arg, pred_names);
+    if (! isempty (cat_idx_set))
+      cat_flags(cat_idx_set) = true;
+    endif
+
+    ## raw_cols for VariableInfo building
+    raw_cols = cell (1, p + 1);
+    for k = 1:p
+      raw_cols{k} = raw_X(:, k);
+    endfor
+    raw_cols{p+1} = raw_y;
+
+  endif ## is_table_input
+
+  ## --- Weights ---
+  if (isempty (weights_arg))
+    weights = ones (n, 1);
+  else
+    weights = weights_arg(:);
+    if (numel (weights) != n)
+      error ("fitlm: Weights must have %d elements.", n);
+    endif
+  endif
+
+  ## --- Exclusion mask ---
+  excl_mask = false (n, 1);
+  if (! isempty (excl_arg))
+    if (islogical (excl_arg))
+      excl_mask = logical (excl_arg(:));
+    else
+      ## NOTE(review): numeric Exclude indices are not range-checked; an
+      ## index > n would silently grow the mask — confirm desired behavior.
+      idx = round (excl_arg(:));
+      excl_mask(idx) = true;
+    endif
+  endif
+
+  ## --- Intercept ---
+  has_intercept = logical (intercept_arg);
+
+  ## If modelspec is a formula string, extract has_intercept from it
+  if (ischar (modelspec) && ! isempty (strfind (modelspec, "~")))
+    ## NOTE(review): the pattern '-\s*1' also matches substrings such as
+    ## '- 12', falsely suppressing the intercept — verify against a
+    ## tokenized check of the formula.
+    has_intercept = isempty (regexp (modelspec, '-\s*1'));
+    if (! has_intercept)
+      ## '-1' suppresses intercept; strip it for later parsing
+      ## NOTE(review): stripping is not implemented here — the downstream
+      ## formula parser handles the '-1' token itself; confirm.
+    endif
+  endif
+
+
+  ## DummyVarCoding
+  dummy_coding = dummy_coding_arg;
+
+  ## If Intercept=false was explicitly given, override
+  if (! intercept_arg)
+    has_intercept = false;
+  endif
+
+endfunction
+
+## ── Internal helpers ─────────────────────────────────────────────────────────
+
+function result = __is_nv_key__ (s)
+  ## True when S names one of fitlm's recognized name-value arguments.
+  ## Anything else (e.g. a formula containing '~' or a model shorthand)
+  ## is treated as a modelspec by the caller.
+  nv_names = {"weights", "exclude", "intercept", "categoricalvars", ...
+              "varnames", "robustopts", "responsevar", "predictorvars", ...
+              "dummyvarcoding"};
+  result = ! isempty (find (strcmpi (s, nv_names), 1));
+endfunction
+
+function [weights_arg, excl_arg, intercept_arg, cat_vars_arg, var_names_arg, ...
+          resp_var_arg, pred_vars_arg, dummy_coding_arg] = __parse_nv__ (varargin)
+  ## Parse fitlm's name-value pair arguments, returning each recognized
+  ## option (or its default when absent).  Unknown names trigger a warning
+  ## and are skipped; a trailing name without a value is an error.
+  weights_arg      = [];
+  excl_arg         = [];
+  intercept_arg    = true;
+  cat_vars_arg     = [];
+  var_names_arg    = {};
+  resp_var_arg     = "";
+  pred_vars_arg    = {};
+  dummy_coding_arg = "reference";
+
+  nargs = numel (varargin);
+  pos = 1;
+  while (pos <= nargs)
+    name = varargin{pos};
+    if (! ischar (name))
+      error ("fitlm: expected name-value argument name at position %d.", pos);
+    endif
+    if (pos + 1 > nargs)
+      error ("fitlm: name-value argument '%s' has no value.", name);
+    endif
+    value = varargin{pos+1};
+    switch (lower (name))
+      case "weights"
+        weights_arg = value;
+      case "exclude"
+        excl_arg = value;
+      case "intercept"
+        intercept_arg = logical (value);
+      case "categoricalvars"
+        cat_vars_arg = value;
+      case "varnames"
+        var_names_arg = value;
+      case "robustopts"
+        ## Accepted but not implemented in Phase 4
+      case "dummyvarcoding"
+        dummy_coding_arg = lower (value);
+      case "responsevar"
+        resp_var_arg = value;
+      case "predictorvars"
+        pred_vars_arg = value;
+      otherwise
+        warning ("fitlm: ignoring unknown name-value argument '%s'.", name);
+    endswitch
+    pos = pos + 2;
+  endwhile
+endfunction
+
+function idx_set = __resolve_cat_vars__ (cat_vars_arg, pred_names)
+  ## Map a CategoricalVars specification (numeric indices, a single name,
+  ## or a cell array of names) onto integer indices into PRED_NAMES.
+  ## Names that match no predictor are silently skipped.
+  idx_set = [];
+  if (isempty (cat_vars_arg))
+    return;
+  endif
+  if (isnumeric (cat_vars_arg))
+    idx_set = round (cat_vars_arg(:))';
+  elseif (iscell (cat_vars_arg))
+    for j = 1:numel (cat_vars_arg)
+      hit = find (strcmp (pred_names, cat_vars_arg{j}));
+      if (! isempty (hit))
+        idx_set(end+1) = hit;
+      endif
+    endfor
+  elseif (ischar (cat_vars_arg))
+    hit = find (strcmp (pred_names, cat_vars_arg));
+    if (! isempty (hit))
+      idx_set = hit;
+    endif
+  endif
+endfunction
diff --git a/inst/private/__stepwiselm_candidates__.m b/inst/private/__stepwiselm_candidates__.m
new file mode 100644
index 00000000..de8d92ef
--- /dev/null
+++ b/inst/private/__stepwiselm_candidates__.m
@@ -0,0 +1,219 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <http://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {@var{candidates} =} __stepwiselm_candidates__ @
+## (@var{current_terms}, @var{lower_terms}, @var{upper_terms}, @
+## @var{cat_flags}, @var{pred_names})
+##
+## Enumerate candidate terms for adding or removing in stepwise regression.
+##
+## @strong{Inputs}
+##
+## @table @var
+## @item current_terms
+## Binary terms matrix of the current model (n_terms x n_vars).
+## Includes the response column (last column, always 0).
+## @item lower_terms
+## Binary terms matrix of the lower (minimum) model.
+## @item upper_terms
+## Binary terms matrix of the upper (maximum) model.
+## @item cat_flags
+## Logical vector (1 x n_vars) indicating categorical variables.
+## @item pred_names
+## Cell array of predictor variable names.
+## @end table
+##
+## @strong{Output}
+##
+## @var{candidates} is a struct array. Each element has fields:
+## @table @code
+## @item term_row
+## Binary row vector (1 x n_vars) for this term.
+## @item term_name
+## Char string: e.g., @code{'x1'}, @code{'x1:x2'}, @code{'Species'}.
+## @item action
+## @code{'add'} or @code{'remove'}.
+## @item df_term
+## Number of design-matrix columns this term contributes:
+## 1 for numeric, k-1 for k-level categorical, products for interactions.
+## @end table
+##
+## @seealso{stepwiselm, __stepwiselm_ftest__}
+## @end deftypefn
+
+function candidates = __stepwiselm_candidates__ (current_terms, lower_terms, ...
+                                                 upper_terms, cat_flags, ...
+                                                 pred_names)
+  ## Enumerate candidate term actions for one stepwise iteration.
+  ##
+  ## Remove candidates: rows of CURRENT_TERMS that are neither the
+  ## intercept (all-zeros row) nor protected by the lower model.
+  ## Add candidates: rows of UPPER_TERMS not already in CURRENT_TERMS,
+  ## excluding the intercept (always in the model).
+  ##
+  ## Row membership is tested with ismember (..., "rows"), replacing the
+  ## previous O(rows^2) nested isequal loops.
+
+  candidates = struct ("term_row", {}, "term_name", {}, ...
+                       "action", {}, "df_term", {});
+
+  p = numel (pred_names);            ## number of predictors (excl. response)
+
+  ## ── REMOVE candidates ─────────────────────────────────────────────────────
+  is_protected = ismember (current_terms, lower_terms, "rows");
+  for i = 1:rows (current_terms)
+    row = current_terms(i, :);
+    if (all (row == 0) || is_protected(i))
+      continue;                      ## intercept or lower-model term
+    endif
+    c.term_row  = row;
+    c.term_name = __term_row_to_name__ (row, pred_names, p);
+    c.action    = "remove";
+    c.df_term   = __compute_df_term__ (row, cat_flags, pred_names, p);
+    candidates(end+1) = c;
+  endfor
+
+  ## ── ADD candidates ────────────────────────────────────────────────────────
+  is_present = ismember (upper_terms, current_terms, "rows");
+  for i = 1:rows (upper_terms)
+    row = upper_terms(i, :);
+    if (all (row == 0) || is_present(i))
+      continue;                      ## intercept or already in the model
+    endif
+    c.term_row  = row;
+    c.term_name = __term_row_to_name__ (row, pred_names, p);
+    c.action    = "add";
+    c.df_term   = __compute_df_term__ (row, cat_flags, pred_names, p);
+    candidates(end+1) = c;
+  endfor
+
+endfunction
+
+
+## ── Helper: term row → human-readable name ──────────────────────────────────
+
+function name = __term_row_to_name__ (row, pred_names, p)
+  ## Render one terms-matrix row as a readable label such as 'x1',
+  ## 'x1:x2', 'Species', or 'x1^2'.  Only columns 1..P (the predictors)
+  ## are inspected; a trailing response column, if present, is ignored.
+  ## An all-zero row denotes the intercept and is rendered as '1'.
+
+  vars = find (row(1:p));
+  if (isempty (vars))
+    name = "1";
+    return;
+  endif
+
+  labels = {};
+  for v = vars(:)'
+    if (row(v) == 1)
+      labels{end+1} = pred_names{v};
+    else
+      labels{end+1} = sprintf ("%s^%d", pred_names{v}, row(v));
+    endif
+  endfor
+  name = strjoin (labels, ":");
+endfunction
+
+
+## ── Helper: compute df_term ─────────────────────────────────────────────────
+
+function df = __compute_df_term__ (row, cat_flags, pred_names, p)
+  ## Number of design-matrix columns contributed by one term.
+  ##
+  ## CAT_FLAGS may be either:
+  ##   - logical: true marks a categorical predictor, but the level count
+  ##     is unknown here, so each factor contributes 1; the true df is
+  ##     resolved later in __stepwiselm_ftest__ by comparing design-matrix
+  ##     column counts of the two nested models; or
+  ##   - numeric: 0 for a numeric predictor, k (> 1) for a categorical
+  ##     predictor with k levels, which contributes k-1 columns.
+  ##
+  ## Rules (§8 of reference doc):
+  ##   Numeric predictor (incl. powers such as x^2): df = 1
+  ##   Categorical (k levels):                       df = k - 1
+  ##   Interactions multiply the factor contributions, e.g.
+  ##   cat(j) x cat(k): df = (j-1) * (k-1)
+  ##   Intercept (all-zero row):                     df = 1
+
+  active = find (row(1:p));
+  df = 1;
+  ## Only a numeric CAT_FLAGS vector carries level counts; with a logical
+  ## vector every factor contributes 1 and df stays 1.
+  if (isnumeric (cat_flags))
+    for v = active(:)'               ## (:)' guards against column output of find
+      if (cat_flags(v) > 1)
+        df = df * (cat_flags(v) - 1);
+      endif
+    endfor
+  endif
+endfunction
diff --git a/inst/private/__stepwiselm_ftest__.m b/inst/private/__stepwiselm_ftest__.m
new file mode 100644
index 00000000..00b110c2
--- /dev/null
+++ b/inst/private/__stepwiselm_ftest__.m
@@ -0,0 +1,202 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <http://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {[@var{fstat}, @var{pval}, @var{sse_new}, @
+## @var{mse_new}, @var{df_term}, @var{crit_new}] =} @
+## __stepwiselm_ftest__ (@var{raw_X}, @var{raw_y}, @var{weights}, @
+## @var{incl_mask}, @var{current_terms}, @var{candidate}, @
+## @var{cat_flags}, @var{var_names}, @var{pred_names}, @
+## @var{raw_cols}, @var{has_intercept}, @var{criterion})
+##
+## Compute the partial F-test statistic for a candidate term action.
+##
+## This function builds the design matrix for the candidate model using
+## @code{__fitlm_build_design__} and calls @code{lm_fit_engine} directly
+## (not @code{fitlm}) for maximum performance inside the stepwise loop.
+##
+## @strong{F-test formula (confirmed §7)}
+##
+## For @strong{adding} a term:
+## @example
+## F = (SSE_current - SSE_augmented) / df_term / MSE_augmented
+## p = 1 - fcdf(F, df_term, DFE_augmented)
+## @end example
+##
+## For @strong{removing} a term:
+## @example
+## F = (SSE_reduced - SSE_current) / abs(df_term) / MSE_current
+## p = 1 - fcdf(F, abs(df_term), DFE_current)
+## @end example
+##
+## Key rule: the denominator always uses MSE of the @strong{larger}
+## (more complex) model.
+##
+## @seealso{stepwiselm, __stepwiselm_candidates__, lm_fit_engine}
+## @end deftypefn
+
+function [fstat, pval, sse_new, mse_new, df_term, crit_new] = ...
+           __stepwiselm_ftest__ (raw_X, raw_y, weights, incl_mask, ...
+                                 current_terms, candidate, cat_flags, ...
+                                 var_names, pred_names, raw_cols, ...
+                                 has_intercept, criterion)
+  ## Partial F-test for one candidate add/remove action.
+  ##
+  ## Builds the design matrices of the current and candidate models via
+  ## __fitlm_build_design__, fits both nested models with lm_fit_engine,
+  ## and compares them.  The denominator MSE and the error df always come
+  ## from the LARGER (more complex) of the two models (confirmed §7).
+  ##
+  ## Outputs:
+  ##   fstat, pval — F statistic and p-value for the candidate action
+  ##   sse_new, mse_new — error statistics of the CANDIDATE model
+  ##   df_term  — design-matrix columns gained/lost by the action (exact
+  ##              even for categorical terms, via column-count difference)
+  ##   crit_new — selection-criterion value of the candidate model
+
+  n_pred = numel (pred_names);
+  excl_mask = ! incl_mask;       ## __fitlm_build_design__ expects excl_mask
+
+  ## Determine the larger (big) and smaller (small) nested models.
+  if (strcmp (candidate.action, "add"))
+    ## Candidate = current terms plus the new term, re-sorted so that
+    ## lower-order terms come first in the design matrix.
+    big_terms = [current_terms; candidate.term_row];
+    [~, order] = sort (sum (big_terms(:, 1:n_pred), 2));
+    big_terms = big_terms(order, :);
+    small_terms = current_terms;
+  else
+    ## Candidate = current terms minus the term being removed.
+    big_terms = current_terms;
+    small_terms = __remove_term_row__ (current_terms, candidate.term_row);
+  endif
+
+  ## Rank-deficiency warnings are expected while probing candidate models.
+  ## Suppress them, but restore the caller's previous warning state even
+  ## if fitting throws (the old code unconditionally turned the warning
+  ## back "on" and leaked the "off" state on error).
+  prev = warning ("query", "CompactLinearModel:rankDeficient");
+  warning ("off", "CompactLinearModel:rankDeficient");
+  unwind_protect
+    s_big   = __design_fit__ (raw_X, raw_y, big_terms, var_names, ...
+                              cat_flags, weights, excl_mask, pred_names, ...
+                              raw_cols, has_intercept, incl_mask);
+    s_small = __design_fit__ (raw_X, raw_y, small_terms, var_names, ...
+                              cat_flags, weights, excl_mask, pred_names, ...
+                              raw_cols, has_intercept, incl_mask);
+  unwind_protect_cleanup
+    warning (prev.state, "CompactLinearModel:rankDeficient");
+  end_unwind_protect
+
+  ## df_term = difference in design-matrix columns (correct for categoricals)
+  df_term = s_big.p_tot - s_small.p_tot;
+  if (df_term <= 0)
+    df_term = 1;                 ## safety fallback for degenerate designs
+  endif
+
+  ## F = (SSE_small - SSE_big) / df_term / MSE_big
+  ## Denominator is always the MSE of the LARGER model.
+  if (s_big.mse > 0)
+    fstat = (s_small.sse - s_big.sse) / df_term / s_big.mse;
+    pval  = 1 - fcdf (fstat, df_term, s_big.dfe);
+  else
+    fstat = 0;
+    pval  = 1;
+  endif
+
+  ## Report statistics for the CANDIDATE model: the big model when adding,
+  ## the small model when removing.
+  if (strcmp (candidate.action, "add"))
+    s_cand = s_big;
+  else
+    s_cand = s_small;
+  endif
+  sse_new  = s_cand.sse;
+  mse_new  = s_cand.mse;
+  crit_new = __compute_criterion__ (s_cand, criterion);
+
+endfunction
+
+
+## ── Helper: build design for TERMS and fit on the included rows ─────────────
+
+function s = __design_fit__ (raw_X, raw_y, terms, var_names, cat_flags, ...
+                             weights, excl_mask, pred_names, raw_cols, ...
+                             has_intercept, incl_mask)
+  [~, ~, X, ~, ~, ~, ~, ~] = ...
+    __fitlm_build_design__ (raw_X, raw_y, terms, var_names, cat_flags, ...
+                            weights, excl_mask, pred_names, raw_cols, ...
+                            has_intercept);
+  s = lm_fit_engine (X(incl_mask, :), raw_y(incl_mask), weights(incl_mask));
+endfunction
+
+
+## ── Helper: remove a term row from a terms matrix ───────────────────────────
+
+function new_terms = __remove_term_row__ (terms, term_row)
+  ## Remove the FIRST row of TERMS that exactly matches TERM_ROW.
+  ## (The previous docstring claimed all matching rows were removed,
+  ## but the loop broke after the first match — only one row goes.)
+  ## Remaining rows keep their original order; when no row matches,
+  ## TERMS is returned unchanged.
+  hit = find (ismember (terms, term_row, "rows"), 1);
+  new_terms = terms;
+  new_terms(hit, :) = [];        ## no-op when hit is empty
+endfunction
+
+
+## ── Helper: compute criterion value from lm_fit_engine output ───────────────
+
+function val = __compute_criterion__ (s, criterion)
+  ## Extract the model-selection criterion value from an lm_fit_engine
+  ## struct.  R-squared criteria are negated so that "smaller is better"
+  ## holds uniformly across all criteria.
+  switch (criterion)
+    case "aic"
+      val = s.crit.AIC;
+    case "bic"
+      val = s.crit.BIC;
+    case "rsquared"
+      val = -s.rsq.Ordinary;     ## negate so lower = better
+    case "adjrsquared"
+      val = -s.rsq.Adjusted;
+    otherwise
+      ## Covers "sse" and any unrecognized criterion string.
+      val = s.sse;
+  endswitch
+endfunction
diff --git a/inst/private/__stepwiselm_parse_inputs__.m b/inst/private/__stepwiselm_parse_inputs__.m
new file mode 100644
index 00000000..f1c30220
--- /dev/null
+++ b/inst/private/__stepwiselm_parse_inputs__.m
@@ -0,0 +1,138 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <http://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {[@var{raw_X}, @var{raw_y}, @var{var_names}, @
+## @var{cat_flags}, @var{weights}, @var{excl_mask}, @var{modelspec}, @
+## @var{obs_names}, @var{resp_name}, @var{pred_names}, @var{has_intercept}, @
+## @var{raw_cols}, @var{lower_spec}, @var{upper_spec}, @var{p_enter}, @
+## @var{p_remove}, @var{criterion}, @var{n_steps}, @var{verbose}] =} @
+## __stepwiselm_parse_inputs__ (@var{varargin})
+##
+## Parse stepwiselm inputs. Extracts the seven stepwise-specific name-value
+## arguments (Lower, Upper, PEnter, PRemove, Criterion, NSteps, Verbose),
+## validates them, then delegates the remaining arguments to
+## @code{__fitlm_parse_inputs__} for standard fitlm parsing.
+##
+## Validation rules:
+## @itemize
+## @item @code{PEnter} must be strictly less than @code{PRemove}, regardless
+## of criterion. This matches MATLAB's behavior (even for AIC/BIC).
+## @item @code{Criterion} must be one of @code{'sse'}, @code{'aic'},
+## @code{'bic'}, @code{'rsquared'}, @code{'adjrsquared'}.
+## @item @code{NSteps} must be a positive integer or @code{Inf}.
+## @item @code{Verbose} must be 0, 1, or 2.
+## @end itemize
+##
+## @seealso{stepwiselm, __fitlm_parse_inputs__}
+## @end deftypefn
+
+function [raw_X, raw_y, var_names, cat_flags, weights, excl_mask, ...
+          modelspec, obs_names, resp_name, pred_names, has_intercept, ...
+          raw_cols, lower_spec, upper_spec, p_enter, p_remove, ...
+          criterion, n_steps, verbose] = ...
+           __stepwiselm_parse_inputs__ (varargin)
+
+  ## Defaults for the seven stepwise-specific name-value arguments.
+  lower_spec = "constant";
+  upper_spec = "interactions";
+  p_enter    = 0.05;
+  p_remove   = 0.10;
+  criterion  = "sse";
+  n_steps    = Inf;
+  verbose    = 1;
+
+  ## Split varargin: stepwise-specific keys are consumed here; everything
+  ## else (data, modelspec, fitlm options) is forwarded verbatim to
+  ## __fitlm_parse_inputs__.
+  passthrough = {};
+  step_keys = {"lower", "upper", "penter", "premove", ...
+               "criterion", "nsteps", "verbose"};
+
+  nargs = numel (varargin);
+  k = 1;
+  while (k <= nargs)
+    arg = varargin{k};
+    if (ischar (arg) && any (strcmp (lower (arg), step_keys)))
+      if (k + 1 > nargs)
+        error ("stepwiselm: name-value argument '%s' has no value.", ...
+               arg);
+      endif
+      val = varargin{k+1};
+      switch (lower (arg))
+        case "lower"
+          lower_spec = val;
+        case "upper"
+          upper_spec = val;
+        case "penter"
+          p_enter = double (val);
+        case "premove"
+          p_remove = double (val);
+        case "criterion"
+          criterion = lower (val);
+        case "nsteps"
+          n_steps = double (val);
+        case "verbose"
+          verbose = double (val);
+      endswitch
+      k = k + 2;
+    else
+      ## Non-char args and fitlm keys pass straight through.
+      passthrough{end+1} = arg;
+      k = k + 1;
+    endif
+  endwhile
+
+  ## ── Validate stepwise arguments ───────────────────────────────────────────
+
+  ## PEnter < PRemove always required (matches MATLAB, even for AIC/BIC)
+  if (p_enter >= p_remove)
+    error ("stepwiselm: Cannot have PEnter (%g) >= PRemove (%g) for the %s criterion.", ...
+           p_enter, p_remove, criterion);
+  endif
+
+  ## Criterion must be valid
+  valid_criteria = {"sse", "aic", "bic", "rsquared", "adjrsquared"};
+  if (! any (strcmp (criterion, valid_criteria)))
+    error ("stepwiselm: Criterion must be one of: %s.", ...
+           strjoin (valid_criteria, ", "));
+  endif
+
+  ## NSteps must be a positive integer or Inf
+  if (! (isinf (n_steps) || (n_steps > 0 && n_steps == round (n_steps))))
+    error ("stepwiselm: NSteps must be a positive integer or Inf.");
+  endif
+
+  ## Verbose must be 0, 1, or 2
+  if (! any (verbose == [0, 1, 2]))
+    error ("stepwiselm: Verbose must be 0, 1, or 2.");
+  endif
+
+  ## ── Delegate everything else to the fitlm parser ──────────────────────────
+  [raw_X, raw_y, var_names, cat_flags, weights, excl_mask, ...
+   modelspec, obs_names, resp_name, pred_names, has_intercept, raw_cols] = ...
+    __fitlm_parse_inputs__ (passthrough{:});
+
+endfunction
diff --git a/inst/private/lm_fit_engine.m b/inst/private/lm_fit_engine.m
new file mode 100644
index 00000000..1ca03d43
--- /dev/null
+++ b/inst/private/lm_fit_engine.m
@@ -0,0 +1,263 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <http://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Private Function} {@var{s} =} lm_fit_engine (@var{X}, @var{y})
+## @deftypefnx {Private Function} {@var{s} =} lm_fit_engine (@var{X}, @var{y}, @var{weights})
+##
+## QR-based ordinary least squares regression engine.
+##
+## This is the single source of truth for all linear regression math in the
+## statistics package. It is called by @code{fitlm} (Phase 4) and by
+## @code{LinearModel} methods such as @code{addTerms} and @code{removeTerms}
+## (Phase 3).
+##
+## @var{X} is the @math{n x p} design matrix (including intercept column if
+## desired). @var{y} is the @math{n x 1} response vector. @var{weights} is
+## an optional @math{n x 1} vector of observation weights; pass @code{[]} for
+## unweighted OLS.
+##
+## Returns a struct @var{s} with the following fields:
+##
+## @table @code
+## @item beta
+## Coefficient estimates (p x 1).
+## @item se
+## Standard errors (p x 1).
+## @item tstat
+## t-statistics (p x 1).
+## @item pval
+## p-values for two-sided t-test (p x 1).
+## @item vcov
+## Coefficient covariance matrix (p x p).
+## @item yhat
+## Fitted values (n x 1).
+## @item resid
+## Residuals (n x 1).
+## @item dfe
+## Error degrees of freedom.
+## @item sse
+## Sum of squared errors (residual sum of squares).
+## @item ssr
+## Regression sum of squares.
+## @item sst
+## Total sum of squares (= SSE + SSR).
+## @item mse
+## Mean squared error (SSE / DFE).
+## @item rmse
+## Root mean squared error.
+## @item logL
+## Log-likelihood under normality assumption.
+## @item p_est
+## Number of estimated (non-rank-deficient) coefficients.
+## @item p_tot
+## Total number of coefficients in the model.
+## @item rank_def
+## Logical vector indicating rank-deficient columns.
+## @item rsq
+## Struct with fields @code{Ordinary} and @code{Adjusted}.
+## @item crit
+## Struct with fields @code{AIC}, @code{AICc}, @code{BIC}, @code{CAIC}.
+## @item n
+## Number of observations.
+## @item Q
+## Economy QR factor for non-deficient columns (n x p_est). Used by
+## @code{LinearModel} to compute the hat matrix without a second QR.
+## @item h
+## Leverage vector (n x 1), equal to @code{diag(Q*Q')} = row sums of
+## squares of @var{Q}.
+## @end table
+##
+## @seealso{CompactLinearModel}
+## @end deftypefn
+
+function s = lm_fit_engine (X, y, weights)
+
+  ## Core OLS/WLS solver: one economy QR factorization, rank-deficiency
+  ## handling, and all derived summary statistics packed into one struct.
+  ## (Inputs/outputs are documented in the texinfo header above.)
+
+  if (nargin < 2)
+    print_usage ();
+  endif
+  if (nargin < 3)
+    weights = [];              ## empty means ordinary (unweighted) OLS
+  endif
+
+  n = rows (X);                ## number of observations
+  p = columns (X);             ## total number of coefficients
+
+  ## Weighted or unweighted QR
+  ## In the weighted case sqrt(w).*X is factorized, so R'R = X'*diag(w)*X.
+  if (isempty (weights))
+    [Q, R] = qr (X, 0);
+    Qty = Q' * y;
+  else
+    sqw = sqrt (weights(:));
+    [Q, R] = qr (bsxfun (@times, sqw, X), 0);
+    Qty = Q' * (sqw .* y);
+  endif
+
+  ## Rank deficiency detection
+  ## Detect before solving to avoid spurious singular-matrix warnings.
+  ## A column is flagged when its pivot |R(j,j)| falls below a tolerance
+  ## scaled by the matrix size and infinity norm.
+  tol = max ([n, p]) * eps (norm (R, "inf"));
+  rank_def = abs (diag (R)) < tol;
+
+  ## Solve for beta
+  if (any (rank_def))
+    warning ("CompactLinearModel:rankDeficient", ...
+             "CompactLinearModel: rank deficient, rank = %d.", ...
+             sum (! rank_def));
+    ## Solve only the non-deficient subsystem; deficient coefficients
+    ## are pinned to zero.
+    idx_good = find (! rank_def);
+    beta = zeros (p, 1);
+    R_good = R(idx_good, idx_good);
+    beta(idx_good) = R_good \ Qty(idx_good);
+  else
+    beta = R \ Qty;
+  endif
+
+  ## Residuals and sums of squares
+  ## NOTE(review): yhat/resid/SSE are computed on the UNWEIGHTED scale
+  ## (plain X*beta and an unweighted sum of squares) even when weights
+  ## are supplied, while beta and the QR come from the weighted system.
+  ## Confirm this matches the intended WLS definition of SSE/MSE before
+  ## relying on weighted SE/t/p-values.
+  yhat = X * beta;
+  resid = y - yhat;
+  p_est = sum (! rank_def);    ## NumEstimatedCoefficients
+  dfe = n - p_est;
+  sse = sum (resid .^ 2);
+  ybar = mean (y);
+  ssr = sum ((yhat - ybar) .^ 2);
+  sst = sse + ssr;             ## Always SSE + SSR (not sum((y-ybar)^2))
+  mse = sse / dfe;             ## NOTE(review): Inf/NaN when dfe == 0
+  rmse = sqrt (mse);
+
+  ## Coefficient covariance
+  ## Use pinv-style via R to handle rank deficiency: invert only the
+  ## estimable sub-block of R; deficient rows/columns of Rinv stay zero,
+  ## giving zero variance for the pinned coefficients.
+  Rinv = zeros (p, p);
+  idx = find (! rank_def);
+  R_est = R(idx, idx);
+  Rinv(idx, idx) = R_est \ eye (numel (idx));
+  vcov = (Rinv * Rinv') * mse;
+
+  ## Deficient coefficients are reported with SE = 0 and NaN t/p so
+  ## callers can tell they were not estimated.
+  se = sqrt (diag (vcov));
+  tstat = beta ./ se;
+  tstat(rank_def) = NaN;
+  se(rank_def) = 0;
+  pval = 2 * (1 - tcdf (abs (tstat), dfe));
+  pval(rank_def) = NaN;
+
+  ## Log-likelihood and information criteria
+  ## Under normality, MLE variance estimate is sigma2 = SSE/n (not MSE = SSE/DFE)
+  ## This matches MATLAB's computation of LogLikelihood.
+  sigma2_mle = sse / n;
+  logL = -n/2 * log (2 * pi * sigma2_mle) - sse / (2 * sigma2_mle);
+
+  ## m = NumEstimatedCoefficients (no +1 for sigma^2)
+  ## Verified: for n=150, logL=-34.787, p_est=5 -> AIC = -2*(-34.787)+2*5 = 79.574
+  m = p_est;
+  aic = -2 * logL + 2 * m;
+  aicc = aic + (2 * m * (m + 1)) / (n - m - 1);  ## small-sample correction
+  bic = -2 * logL + m * log (n);
+  caic = -2 * logL + m * (log (n) + 1);
+
+  ## Pack output struct (field meanings in the texinfo header)
+  s.beta = beta;
+  s.se = se;
+  s.tstat = tstat;
+  s.pval = pval;
+  s.vcov = vcov;
+  s.yhat = yhat;
+  s.resid = resid;
+  s.n = n;
+  s.dfe = dfe;
+  s.sse = sse;
+  s.ssr = ssr;
+  s.sst = sst;
+  s.mse = mse;
+  s.rmse = rmse;
+  s.logL = logL;
+  s.p_est = p_est;
+  s.p_tot = p;
+  s.rank_def = rank_def;
+  s.rsq = struct ("Ordinary", ssr / sst, ...
+                  "Adjusted", 1 - (sse / dfe) / (sst / (n - 1)));
+  s.crit = struct ("AIC", aic, "AICc", aicc, "BIC", bic, "CAIC", caic);
+
+  ## QR factor and leverage for LinearModel diagnostics
+  ## h = diag(Q*Q') computed as row sums of squares — avoids a second QR.
+  Q_est = Q(:, ! rank_def);
+  s.Q = Q_est;
+  s.h = sum (Q_est .^ 2, 2);
+
+endfunction
+
+## Simple 2-predictor OLS — linearly independent columns
+%!test
+%! X = [ones(5,1), [1;2;3;4;5], [2;1;4;3;6]];
+%! y = X * [1; 2; -0.5];
+%! s = lm_fit_engine (X, y);
+%! ## Perfect fit (noise-free)
+%! assert (s.beta, [1; 2; -0.5], 1e-10);
+%! assert (s.resid, zeros (5, 1), 1e-10);
+%! assert (s.p_tot, 3);
+%! assert (s.n, 5);
+
+## Known regression — y = 2 + 3*x with noise-free data
+%!test
+%! x = (1:10)';
+%! y = 2 + 3 * x;
+%! X = [ones(10,1), x];
+%! s = lm_fit_engine (X, y);
+%! assert (s.beta, [2; 3], 1e-10);
+%! assert (s.sse, 0, 1e-10);
+%! assert (s.rsq.Ordinary, 1, 1e-10);
+
+## Rank-deficient design matrix
+%!test
+%! warning ("off", "CompactLinearModel:rankDeficient");
+%! X = [1, 2, 4; 1, 3, 6; 1, 4, 8; 1, 5, 10]; ## col3 = 2*col2
+%! y = [1; 2; 3; 4];
+%! s = lm_fit_engine (X, y);
+%! assert (s.p_est, 2);
+%! assert (s.p_tot, 3);
+%! assert (any (s.rank_def), true);
+%! ## The rank-deficient coefficient should be zeroed
+%! assert (s.beta(s.rank_def), 0);
+%! assert (s.se(s.rank_def), 0);
+%! assert (isnan (s.tstat(s.rank_def)), true);
+%! assert (isnan (s.pval(s.rank_def)), true);
+%! warning ("on", "CompactLinearModel:rankDeficient");
+
+## SST = SSE + SSR identity
+%!test
+%! X = [ones(20,1), randn(20,2)];
+%! y = X * [1; 2; -1] + 0.5 * randn (20, 1);
+%! s = lm_fit_engine (X, y);
+%! assert (s.sst, s.sse + s.ssr, 1e-10);
+%! assert (s.dfe, 20 - 3);
+
+## ModelCriterion — m = p_est (no +1)
+## Verify AIC = -2*logL + 2*p_est
+%!test
+%! X = [ones(20,1), (1:20)'];
+%! y = 3 + 2 * (1:20)' + randn (20, 1) * 0.1;
+%! s = lm_fit_engine (X, y);
+%! assert (s.crit.AIC, -2 * s.logL + 2 * s.p_est, 1e-10);
+%! assert (s.crit.BIC, -2 * s.logL + s.p_est * log (s.n), 1e-10);
+
+## Weighted regression
+%!test
+%! X = [ones(5,1), [1;2;3;4;5]];
+%! y = [2; 4; 5; 4; 5];
+%! w = [1; 1; 1; 1; 1];
+%! s1 = lm_fit_engine (X, y, []);
+%! s2 = lm_fit_engine (X, y, w);
+%! ## Equal weights should give same result as unweighted
+%! assert (s1.beta, s2.beta, 1e-10);
+%! assert (s1.sse, s2.sse, 1e-10);
diff --git a/inst/stepwiselm.m b/inst/stepwiselm.m
new file mode 100644
index 00000000..37bb28ef
--- /dev/null
+++ b/inst/stepwiselm.m
@@ -0,0 +1,488 @@
+## Copyright (C) 2026 Jayant Chauhan <0001jayant@gmail.com>
+##
+## This file is part of the statistics package for GNU Octave.
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <http://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {statistics} {@var{mdl} =} stepwiselm (@var{X}, @var{y})
+## @deftypefnx {statistics} {@var{mdl} =} stepwiselm (@var{X}, @var{y}, @var{modelspec})
+## @deftypefnx {statistics} {@var{mdl} =} stepwiselm (@var{tbl})
+## @deftypefnx {statistics} {@var{mdl} =} stepwiselm (@var{tbl}, @var{modelspec})
+## @deftypefnx {statistics} {@var{mdl} =} stepwiselm (@dots{}, @var{Name}, @var{Value})
+##
+## Fit a linear regression model using stepwise regression.
+##
+## @code{stepwiselm} performs stepwise regression starting from an initial
+## model and iteratively adding or removing terms based on their statistical
+## significance, returning a @code{LinearModel} object.
+##
+## @var{modelspec} specifies the @strong{starting} model (default:
+## @qcode{'constant'}).
+##
+## @strong{Stepwise-specific Name-Value Arguments}
+##
+## @table @asis
+## @item @qcode{'Lower'} — minimum model (default: @qcode{'constant'})
+## @item @qcode{'Upper'} — maximum model (default: @qcode{'interactions'})
+## @item @qcode{'PEnter'} — p-value threshold to add a term (default: 0.05)
+## @item @qcode{'PRemove'} — p-value threshold to remove a term (default: 0.10)
+## @item @qcode{'Criterion'} — @qcode{'sse'}, @qcode{'aic'}, or @qcode{'bic'}
+## @item @qcode{'NSteps'} — maximum number of steps (default: Inf)
+## @item @qcode{'Verbose'} — 0 (silent), 1 (one line/step), 2 (per-candidate)
+## @end table
+##
+## All @code{fitlm} name-value arguments are also accepted.
+##
+## @seealso{fitlm, LinearModel}
+## @end deftypefn
+
+function mdl = stepwiselm (varargin)
+
+  if (nargin < 1)
+    print_usage ();
+  endif
+
+  ## ── [1] Parse inputs ──────────────────────────────────────────────────────
+  [raw_X, raw_y, var_names, cat_flags, weights, excl_mask, ...
+   modelspec, obs_names, resp_name, pred_names, has_intercept, ...
+   raw_cols, lower_spec, upper_spec, p_enter, p_remove, ...
+   criterion, n_steps, verbose] = ...
+    __stepwiselm_parse_inputs__ (varargin{:});
+
+  n_pred = numel (pred_names);
+
+  ## ── [2] Build lower/upper terms matrices ──────────────────────────────────
+  lower_terms_pred = __spec_to_terms__ (lower_spec, n_pred, true);
+  upper_terms_pred = __spec_to_terms__ (upper_spec, n_pred, true);
+
+  ## Add response column (always 0)
+  lower_terms = [lower_terms_pred, zeros(rows (lower_terms_pred), 1)];
+  upper_terms = [upper_terms_pred, zeros(rows (upper_terms_pred), 1)];
+
+  ## ── [3] Fit starting model ────────────────────────────────────────────────
+  ## Collect the fitlm name-value arguments once; they are reused verbatim
+  ## for the final fit in step [7].
+  cat_idx = find (cat_flags(1:n_pred));
+  fitlm_args = {};
+  if (! isempty (cat_idx))
+    fitlm_args = [fitlm_args, {'CategoricalVars', cat_idx}];
+  endif
+  if (any (weights != 1))
+    fitlm_args = [fitlm_args, {'Weights', weights}];
+  endif
+  if (any (excl_mask))
+    fitlm_args = [fitlm_args, {'Exclude', excl_mask}];
+  endif
+  if (! isempty (var_names))
+    fitlm_args = [fitlm_args, {'VarNames', var_names}];
+  endif
+
+  if (isa (raw_X, "table"))
+    mdl0 = fitlm (raw_X, modelspec, fitlm_args{:});
+  else
+    mdl0 = fitlm (raw_X, raw_y, modelspec, fitlm_args{:});
+  endif
+
+  current_terms = mdl0.Formula.Terms;
+  incl_mask = mdl0.ObservationInfo.Subset;
+
+  ## ── [4] Initialize step log ───────────────────────────────────────────────
+  start_lp = mdl0.Formula.LinearPredictor;
+  crit_current = __get_model_crit__ (mdl0, criterion);
+
+  n_log = 1;
+  log_action = {'Start'};
+  log_term = {start_lp};
+  log_terms = {current_terms};
+  log_df = mdl0.NumCoefficients;
+  log_del_df = NaN;
+  log_fstat = NaN;
+  log_pval = NaN;
+  log_critval = crit_current;
+
+  ## ── [5] Check for empty candidate set ─────────────────────────────────────
+  all_candidates = __stepwiselm_candidates__ (current_terms, lower_terms, ...
+                                              upper_terms, cat_flags, ...
+                                              pred_names);
+  if (isempty (all_candidates) && verbose >= 1)
+    fprintf ("No terms to add to or remove from initial model.\n");
+  endif
+
+  ## ── [6] Search loop ───────────────────────────────────────────────────────
+  ## Alternate a backward scan (remove terms until none qualifies) with a
+  ## single forward addition, until neither phase can change the model or
+  ## the step budget N_STEPS is exhausted.
+  n_taken = 0;
+  use_ftest = strcmp (criterion, "sse");
+
+  while (n_taken < n_steps)
+
+    ## ── Phase A: BACKWARD SCAN (keep removing) ────────────────────────────
+    while (n_taken < n_steps)
+      candidates = __stepwiselm_candidates__ (current_terms, lower_terms, ...
+                                              upper_terms, cat_flags, ...
+                                              pred_names);
+      ## Filter to remove only
+      rem_idx = [];
+      for ci = 1:numel (candidates)
+        if (strcmp (candidates(ci).action, "remove"))
+          rem_idx(end+1) = ci;
+        endif
+      endfor
+
+      if (isempty (rem_idx))
+        break;
+      endif
+
+      ## Score each removal candidate; for the F-test criterion pick the
+      ## LARGEST p-value above PRemove, otherwise the smallest criterion
+      ## value that improves on the current model.
+      best_ri = 0;
+      best_pval = -Inf;
+      best_fstat = 0;
+      best_crit = Inf;
+      best_df = 0;
+
+      for ri = 1:numel (rem_idx)
+        c = candidates(rem_idx(ri));
+        [f, p, ~, ~, df_t, cr] = __stepwiselm_ftest__ ( ...
+          raw_X, raw_y, weights, incl_mask, current_terms, c, ...
+          cat_flags, var_names, pred_names, raw_cols, has_intercept, ...
+          criterion);
+
+        if (verbose >= 2)
+          if (use_ftest)
+            fprintf ("  pValue for removing %s is %g\n", c.term_name, p);
+          else
+            fprintf ("  %s for removing %s is %g\n", ...
+                     upper (criterion), c.term_name, cr);
+          endif
+        endif
+
+        should_take = false;
+        if (use_ftest)
+          if (p > p_remove && p > best_pval)
+            should_take = true;
+          endif
+        else
+          if (cr < crit_current && cr < best_crit)
+            should_take = true;
+          endif
+        endif
+
+        if (should_take)
+          best_ri = rem_idx(ri);
+          best_pval = p;
+          best_fstat = f;
+          best_crit = cr;
+          best_df = df_t;
+        endif
+      endfor
+
+      if (best_ri == 0)
+        break; ## no qualifying removal
+      endif
+
+      ## Apply removal
+      best_c = candidates(best_ri);
+      mask = true (rows (current_terms), 1);
+      for j = 1:rows (current_terms)
+        if (isequal (current_terms(j, :), best_c.term_row))
+          mask(j) = false;
+          break;
+        endif
+      endfor
+      current_terms = current_terms(mask, :);
+      n_taken++;
+
+      ## Update criterion (for sse the thresholds PEnter/PRemove are used
+      ## instead of a running criterion value)
+      if (! use_ftest)
+        crit_current = best_crit;
+      endif
+
+      ## Log
+      n_log++;
+      log_action{n_log} = "Remove";
+      log_term{n_log} = best_c.term_name;
+      log_terms{n_log} = current_terms;
+      log_df(n_log) = log_df(n_log - 1) - best_df;
+      log_del_df(n_log) = -best_df;
+      log_fstat(n_log) = best_fstat;
+      log_pval(n_log) = best_pval;
+      log_critval(n_log) = best_crit;
+
+      ## Verbose
+      if (verbose >= 1)
+        __print_step__ (n_taken, best_c, best_fstat, best_pval, ...
+                        best_crit, criterion);
+      endif
+    endwhile ## backward
+
+    if (n_taken >= n_steps)
+      break;
+    endif
+
+    ## ── Phase B: FORWARD SCAN (add one) ───────────────────────────────────
+    candidates = __stepwiselm_candidates__ (current_terms, lower_terms, ...
+                                            upper_terms, cat_flags, ...
+                                            pred_names);
+    ## Filter to add only
+    add_idx = [];
+    for ci = 1:numel (candidates)
+      if (strcmp (candidates(ci).action, "add"))
+        add_idx(end+1) = ci;
+      endif
+    endfor
+
+    if (isempty (add_idx))
+      break; ## converged
+    endif
+
+    ## Score each add candidate; for the F-test criterion pick the SMALLEST
+    ## p-value below PEnter, otherwise the best improving criterion value.
+    best_ai = 0;
+    best_pval_a = Inf;
+    best_fstat_a = 0;
+    best_crit_a = Inf;
+    best_df_a = 0;
+
+    for ai = 1:numel (add_idx)
+      c = candidates(add_idx(ai));
+      [f, p, ~, ~, df_t, cr] = __stepwiselm_ftest__ ( ...
+        raw_X, raw_y, weights, incl_mask, current_terms, c, ...
+        cat_flags, var_names, pred_names, raw_cols, has_intercept, ...
+        criterion);
+
+      if (verbose >= 2)
+        if (use_ftest)
+          fprintf ("  pValue for adding %s is %g\n", c.term_name, p);
+        else
+          fprintf ("  %s for adding %s is %g\n", ...
+                   upper (criterion), c.term_name, cr);
+        endif
+      endif
+
+      should_take = false;
+      if (use_ftest)
+        if (p < p_enter && p < best_pval_a)
+          should_take = true;
+        endif
+      else
+        if (cr < crit_current && cr < best_crit_a)
+          should_take = true;
+        endif
+      endif
+
+      if (should_take)
+        best_ai = add_idx(ai);
+        best_pval_a = p;
+        best_fstat_a = f;
+        best_crit_a = cr;
+        best_df_a = df_t;
+      endif
+    endfor
+
+    if (best_ai == 0)
+      ## No qualifying addition remains, and the backward scan above has
+      ## already removed every qualifying term — the search has converged.
+      break;
+    endif
+
+    ## Apply addition
+    best_c = candidates(best_ai);
+    current_terms = [current_terms; best_c.term_row];
+    ## Sort by term order
+    term_order = sum (current_terms(:, 1:n_pred), 2);
+    [~, si] = sort (term_order);
+    current_terms = current_terms(si, :);
+    n_taken++;
+
+    ## Update criterion
+    if (! use_ftest)
+      crit_current = best_crit_a;
+    endif
+
+    ## Log
+    n_log++;
+    log_action{n_log} = "Add";
+    log_term{n_log} = best_c.term_name;
+    log_terms{n_log} = current_terms;
+    log_df(n_log) = log_df(n_log - 1) + best_df_a;
+    log_del_df(n_log) = best_df_a;
+    log_fstat(n_log) = best_fstat_a;
+    log_pval(n_log) = best_pval_a;
+    log_critval(n_log) = best_crit_a;
+
+    ## Verbose
+    if (verbose >= 1)
+      __print_step__ (n_taken, best_c, best_fstat_a, best_pval_a, ...
+                      best_crit_a, criterion);
+    endif
+
+    ## Continue outer loop → back to backward scan
+  endwhile
+
+  ## ── [7] Fit final model ───────────────────────────────────────────────────
+  if (isa (raw_X, "table"))
+    mdl = fitlm (raw_X, current_terms, fitlm_args{:});
+  else
+    mdl = fitlm (raw_X, raw_y, current_terms, fitlm_args{:});
+  endif
+
+  ## ── [8] Build Steps struct ────────────────────────────────────────────────
+  Steps.Start = [resp_name, " ~ ", log_term{1}];
+  Steps.Lower = lower_spec;
+  Steps.Upper = upper_spec;
+  Steps.Criterion = upper (criterion);
+  Steps.PEnter = p_enter;
+  Steps.PRemove = p_remove;
+
+  ## Build History struct (mimicking a table)
+  History.Action = log_action(:);
+  History.TermName = cell (n_log, 1);
+  for k = 1:n_log
+    History.TermName{k} = {log_term{k}};
+  endfor
+  History.Terms = log_terms(:);
+  History.DF = log_df(:);
+  History.delDF = log_del_df(:);
+
+  if (use_ftest)
+    History.FStat = log_fstat(:);
+    History.pValue = log_pval(:);
+  else
+    ## For AIC/BIC, the column is the criterion name, not FStat/pValue
+    History.(upper (criterion)) = log_critval(:);
+  endif
+
+  Steps.History = History;
+
+  ## ── [9] Inject Steps into LinearModel ─────────────────────────────────────
+  mdl.Steps = Steps;
+
+endfunction
+
+
+## ═══════════════════════════════════════════════════════════════════════════════
+## Local helper functions
+## ═══════════════════════════════════════════════════════════════════════════════
+
+function terms = __spec_to_terms__ (spec, p, has_intercept)
+  ## Convert a modelspec SPEC into an (n_terms x P) terms matrix over the
+  ## P predictors.
+  ##
+  ## SPEC may be:
+  ##   * a numeric terms matrix — returned unchanged (a trailing response
+  ##     column, detected by P+1 columns, is stripped);
+  ##   * a shorthand string: 'constant', 'linear', 'interactions',
+  ##     'purequadratic', 'quadratic', or 'full' (any other string falls
+  ##     back to 'linear');
+  ##   * a formula string containing '~' — formulas are not meaningful as
+  ##     stepwise bounds, so they fall back to 'interactions';
+  ##   * a non-char value — falls back to 'interactions'.
+  ##
+  ## When HAS_INTERCEPT is true an all-zero intercept row is prepended to
+  ## the shorthand expansions; when false it is omitted.  A numeric SPEC
+  ## is never modified by HAS_INTERCEPT.
+
+  if (isnumeric (spec) && ismatrix (spec))
+    terms = spec;
+    ## Strip response column if present
+    if (columns (terms) == p + 1)
+      terms = terms(:, 1:p);
+    endif
+    return;
+  endif
+
+  if (! ischar (spec))
+    spec = "interactions";
+  endif
+
+  ## Formula strings cannot serve as upper/lower bounds; fall back.
+  if (! isempty (strfind (spec, "~")))
+    spec = "interactions";
+  endif
+
+  ## Pairwise interaction rows, one per predictor pair (i < j), shared by
+  ## the 'interactions' and 'quadratic'/'full' expansions.
+  inter = zeros (0, p);
+  for i = 1:p-1
+    for j = i+1:p
+      r = zeros (1, p);
+      r(i) = 1; r(j) = 1;
+      inter = [inter; r];
+    endfor
+  endfor
+
+  switch (lower (spec))
+    case "constant"
+      terms = zeros (0, p);
+
+    case "linear"
+      terms = eye (p);
+
+    case "interactions"
+      terms = [eye(p); inter];
+
+    case "purequadratic"
+      terms = [eye(p); 2*eye(p)];
+
+    case {"quadratic", "full"}
+      terms = [eye(p); inter; 2*eye(p)];
+
+    otherwise
+      terms = eye (p);
+  endswitch
+
+  if (has_intercept)
+    terms = [zeros(1, p); terms];
+  endif
+endfunction
+
+
+function val = __get_model_crit__ (mdl, criterion)
+  ## Return the scalar model-selection criterion value of fitted model MDL.
+  ## Values are oriented so that smaller is always better, which is why the
+  ## R-squared variants are negated.  Any unrecognized CRITERION falls back
+  ## to the model's SSE.
+  if (strcmp (criterion, "sse"))
+    val = mdl.SSE;
+  elseif (strcmp (criterion, "aic"))
+    val = mdl.ModelCriterion.AIC;
+  elseif (strcmp (criterion, "bic"))
+    val = mdl.ModelCriterion.BIC;
+  elseif (strcmp (criterion, "rsquared"))
+    val = -mdl.Rsquared.Ordinary;
+  elseif (strcmp (criterion, "adjrsquared"))
+    val = -mdl.Rsquared.Adjusted;
+  else
+    val = mdl.SSE;
+  endif
+endfunction
+
+
+function __print_step__ (step_num, candidate, fstat, pval, critval, criterion)
+  ## Print a one-line summary of a single stepwise-regression step.
+  ##
+  ## STEP_NUM is the 1-based step counter; CANDIDATE is the accepted
+  ## add/remove candidate struct (fields 'action' and 'term_name');
+  ## FSTAT and PVAL are reported when CRITERION is "sse" (F-test mode),
+  ## otherwise CRITVAL is reported under the upper-cased criterion name.
+  ##
+  ## Fixed: the original computed an 'action_str' variable (via toupper)
+  ## that was never used; it has been removed.
+  if (strcmp (candidate.action, "remove"))
+    action_verb = "Removing";
+  else
+    action_verb = "Adding";
+  endif
+
+  if (strcmp (criterion, "sse"))
+    fprintf ("%d. %s %s, FStat = %g, pValue = %g\n", ...
+             step_num, action_verb, candidate.term_name, fstat, pval);
+  else
+    fprintf ("%d. %s %s, %s = %g\n", ...
+             step_num, action_verb, candidate.term_name, ...
+             upper (criterion), critval);
+  endif
+endfunction
+
+
+## ═══════════════════════════════════════════════════════════════════════════════
+## Unit tests
+## ═══════════════════════════════════════════════════════════════════════════════
+
+## Test 1: stepwiselm must reject PEnter >= PRemove (here 0.1 >= 0.05);
+## allowing it could make a term qualify for both entry and removal, so
+## the search would cycle.  The data are arbitrary seeded noise.
+%!error
+%! rng (1);
+%! X = randn (20, 3);
+%! y = randn (20, 1);
+%! stepwiselm (X, y, 'constant', 'PEnter', 0.1, 'PRemove', 0.05);
+