# Source code for models.stacking_model

"""
Cerebrum Forex - Stacking Ensemble Model
Meta-learner that combines XGBoost, LightGBM, RandomForest predictions.
Replaces LSTM for faster training without TensorFlow dependency.
"""

import logging
import pickle
import numpy as np
from pathlib import Path
from typing import Tuple, Optional

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from config.settings import IS_FROZEN
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from .base_model import BaseModel

logger = logging.getLogger(__name__)


class StackingModel(BaseModel):
    """
    Stacking Ensemble Model using XGBoost, LightGBM, RandomForest as base
    learners and LogisticRegression as meta-learner.

    Benefits over LSTM:
    - 10-20x faster training (no deep learning)
    - No TensorFlow/GPU dependency
    - Often better for tabular data
    """

    def __init__(self, timeframe: str, model_dir: Path):
        super().__init__(timeframe, model_dir)
        # One pickle per timeframe, e.g. stacking_H1.pkl
        self.model_path = model_dir / f"stacking_{timeframe}.pkl"

    @property
    def name(self) -> str:
        return "stacking"

    def _build_model(self, n_classes: int) -> StackingClassifier:
        """Build the stacking classifier for ``n_classes`` target classes."""
        # Base estimators (lighter versions for stacking)
        # NOTE: CatBoost removed - not compatible with sklearn 1.8 StackingClassifier
        # (CatBoostClassifier lacks __sklearn_tags__ method)
        from sklearn.ensemble import RandomForestClassifier

        # In frozen (PyInstaller) builds run single-threaded to avoid
        # multiprocessing issues inside the executable.
        n_jobs = 1 if IS_FROZEN else -1
        is_multiclass = n_classes > 2

        base_estimators = [
            ('xgb', XGBClassifier(
                n_estimators=50,
                max_depth=4,
                learning_rate=0.1,
                subsample=0.8,          # Regularization: use 80% of samples
                colsample_bytree=0.8,   # Regularization: use 80% of features
                min_child_weight=5,     # Regularization: min samples per leaf
                objective='multi:softprob' if is_multiclass else 'binary:logistic',
                num_class=n_classes if is_multiclass else None,
                verbosity=0,
                n_jobs=n_jobs
            )),
            ('lgb', LGBMClassifier(
                n_estimators=50,
                max_depth=4,
                learning_rate=0.1,
                subsample=0.8,          # Regularization: use 80% of samples
                colsample_bytree=0.8,   # Regularization: use 80% of features
                min_child_samples=20,   # Regularization: min samples per leaf
                objective='multiclass' if is_multiclass else 'binary',
                num_class=n_classes if is_multiclass else None,
                verbose=-1,
                n_jobs=n_jobs
            )),
            ('rf', RandomForestClassifier(
                n_estimators=50,
                max_depth=6,
                max_samples=0.8,        # Regularization: use 80% of samples per tree
                min_samples_leaf=5,     # Regularization: min samples per leaf
                n_jobs=n_jobs,
                random_state=42
            ))
        ]

        # Meta-learner (multi_class removed in sklearn 1.8+, auto-determined)
        meta_learner = LogisticRegression(
            max_iter=1000,
            solver='lbfgs',
            n_jobs=n_jobs
        )

        return StackingClassifier(
            estimators=base_estimators,
            final_estimator=meta_learner,
            cv=3,                       # 3-fold CV for base predictions
            stack_method='predict_proba',
            n_jobs=n_jobs,
            passthrough=False           # Only use base predictions, not original features
        )

    def train(self, X: np.ndarray, y: np.ndarray,
              X_val: np.ndarray = None, y_val: np.ndarray = None,
              class_weights: np.ndarray = None) -> float:
        """Train the stacking model.

        Returns the balanced accuracy achieved (validation if available,
        else training), or 0.0 on any failure.
        """
        try:
            n_classes = len(np.unique(y))
            logger.info(f"[Stacking {self.timeframe}] Training with {len(X)} samples, {n_classes} classes")

            # Build and train
            self.model = self._build_model(n_classes)

            # NOTE: despite the name, `class_weights` is forwarded verbatim as
            # per-SAMPLE weights — callers must pass an array aligned with the
            # rows of X (one weight per sample, not one per class).
            sample_weight = class_weights if class_weights is not None else None
            self.model.fit(X, y, sample_weight=sample_weight)
            self.is_trained = True

            # Evaluate on the validation split only when BOTH X_val and y_val
            # were supplied (previously X_val without y_val crashed the scorer);
            # otherwise fall back to the (optimistic) training accuracy.
            if X_val is not None and y_val is not None and len(X_val) > 0:
                y_pred = self.model.predict(X_val)
                self.accuracy = balanced_accuracy_score(y_val, y_pred)
            else:
                y_pred = self.model.predict(X)
                self.accuracy = balanced_accuracy_score(y, y_pred)

            logger.info(f"[Stacking {self.timeframe}] ✓ Balanced Accuracy: {self.accuracy:.2%}")
            self.save()
            return self.accuracy

        except Exception as e:
            logger.error(f"Stacking training failed: {e}", exc_info=True)
            return 0.0

    def predict(self, X: np.ndarray) -> Tuple[str, float]:
        """Make prediction.

        Returns a ``(signal, confidence)`` tuple; ``("NEUTRAL", 0.0)`` when
        the model is unavailable or the input schema does not match training.
        """
        if not self.is_trained and not self.load():
            return "NEUTRAL", 0.0

        try:
            # Handle Feature Mismatch (Train vs Predict)
            # 1. New System: names persisted alongside the pickle
            expected_features = self.feature_names
            # 2. Legacy Fallback: names recorded by sklearn during fit.
            #    feature_names_in_ is a numpy array — convert to a list so the
            #    boolean tests below don't raise "truth value is ambiguous"
            #    (previously that ValueError was swallowed by the broad except
            #    and every legacy prediction silently became NEUTRAL).
            if not expected_features and hasattr(self.model, 'feature_names_in_'):
                expected_features = list(self.model.feature_names_in_)

            # Use the determined expected features to align input
            if hasattr(X, 'columns') and expected_features:
                # Check if we have all required features
                missing = [f for f in expected_features if f not in X.columns]
                if missing:
                    logger.warning(f"StackingModel mismatch: Missing {len(missing)} features ({missing[:3]}...). Returning NEUTRAL.")
                    return "NEUTRAL", 0.0

                # Check for "Unnamed" features vs Named features mismatch (Legacy)
                if len(expected_features) > 0 and str(expected_features[0]).startswith("Column_") and not str(X.columns[0]).startswith("Column_"):
                    logger.warning(f"StackingModel schema mismatch: Model expects raw features (Column_X) but got named features. Returning NEUTRAL. (Retrain required)")
                    return "NEUTRAL", 0.0

                # Select only expected columns in correct order
                X = X[expected_features]

            # Ensure plain 2D ndarray input
            if hasattr(X, 'values'):
                X = X.values
            if X.ndim == 1:
                X = X.reshape(1, -1)

            # Get probabilities; predict for the most recent row
            proba = self.model.predict_proba(X)
            pred_proba = proba[-1] if len(proba) > 1 else proba[0]
            pred_class = np.argmax(pred_proba)
            confidence = float(pred_proba[pred_class])

            signal = self.signal_from_prediction(pred_class)
            return signal, confidence

        except Exception as e:
            logger.error(f"Stacking prediction failed: {e}")
            return "NEUTRAL", 0.0

    def load(self) -> bool:
        """Load model from disk; returns True on success."""
        if not self.model_path.exists():
            return False
        try:
            # NOTE(review): pickle.load on a local model artefact — only load
            # files this application itself produced.
            with open(self.model_path, 'rb') as f:
                data = pickle.load(f)
            self.model = data['model']
            self.accuracy = data.get('accuracy', 0.0)
            self.feature_names = data.get('feature_names', [])
            self.is_trained = data.get('is_trained', True)
            logger.info(f"Stacking model loaded from {self.model_path} ({len(self.feature_names)} features)")
            return True
        except Exception as e:
            logger.error(f"Failed to load Stacking model: {e}")
            return False