# Source code for models.xgboost_model

"""
Cerebrum Forex - XGBoost Model
Gradient boosting model for signal prediction.
"""

import logging
import pickle
from pathlib import Path
from typing import Tuple

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from config.settings import default_settings, IS_FROZEN

try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False

from .base_model import BaseModel

logger = logging.getLogger(__name__)



# [docs]
class XGBoostModel(BaseModel):
    """XGBoost model for forex signal prediction"""
    
    def __init__(self, timeframe: str, model_dir: Path):
        """Initialise the model and its default XGBoost hyper-parameters.

        Args:
            timeframe: Timeframe label; forwarded to ``BaseModel`` and used
                in this class's log messages.
            model_dir: Directory forwarded to ``BaseModel``.
        """
        super().__init__(timeframe, model_dir)

        # Define the hyper-parameters unconditionally: load() reads
        # ``self.params`` as a fallback value, so the attribute has to exist
        # even when the xgboost package could not be imported.
        self.params = {
            'objective': 'multi:softmax',
            'max_depth': 6,              # Reduced from 8 to prevent overfitting
            'learning_rate': 0.03,       # Reduced from 0.05 for slower, more robust convergence
            'n_estimators': 800,         # Increased from 500 to compensate for lower LR
            'subsample': 0.6,            # Reduced from 0.7 for more randomness
            'colsample_bytree': 0.6,     # Reduced from 0.7
            'gamma': 0.2,                # Increased min loss reduction
            'min_child_weight': 5,       # Increased from 3 (requires more samples per leaf)
            'random_state': 42,
            'eval_metric': 'mlogloss',
            'tree_method': 'hist',       # Faster training
            'n_jobs': 1 if IS_FROZEN else 2,  # Throttled to prevent CPU contention
        }

        if not XGBOOST_AVAILABLE:
            # train() checks XGBOOST_AVAILABLE again and refuses to run.
            logger.error("XGBoost not installed")
    
    @property
    def name(self) -> str:
        """Return the short identifier of this model type (``"xgboost"``)."""
        return "xgboost"
    

    # [docs]
    def train(self, X: np.ndarray, y: np.ndarray,
              X_val: np.ndarray = None, y_val: np.ndarray = None,
              class_weights: dict = None) -> float:
        """Train the XGBoost classifier and return its balanced accuracy.

        Args:
            X: Training feature matrix (2-D; ``X.shape[1]`` is logged as the
                feature count).
            y: Training labels; flattened to 1-D before use.
            X_val: Optional external validation features.
            y_val: Optional external validation labels. Unless both ``X_val``
                and ``y_val`` are given, a validation part is split off
                ``X``/``y`` using ``default_settings.validation_ratio``.
            class_weights: Either a dict mapping integer class label to
                weight (labels missing from the dict get 1.0), or an
                array-like exposing ``.size`` that holds one weight per row
                of ``X``.

        Returns:
            The balanced accuracy on the validation data, or ``0.0`` when
            XGBoost is unavailable or any step fails (the failure is logged,
            not raised).
        """
        if not XGBOOST_AVAILABLE:
            logger.error("XGBoost not available")
            return 0.0

        tag = f"[XGBoost {self.timeframe}]"

        try:
            # Ensure y is 1D array (fix multilabel error)
            y = np.asarray(y).ravel()

            # Detect number of classes dynamically
            unique_classes = np.unique(y)
            num_classes = len(unique_classes)
            logger.info(f"{tag} Detected {num_classes} classes: {unique_classes}")

            # Adjust objective based on class count
            if num_classes == 2:
                self.params['objective'] = 'binary:logistic'
                self.params.pop('num_class', None)
            else:
                self.params['objective'] = 'multi:softmax'
                self.params['num_class'] = num_classes

            logger.info(f"{tag} Training with {len(X)} samples, {X.shape[1]} features")

            has_weights = (
                class_weights is not None
                and hasattr(class_weights, '__len__')
                and len(class_weights) > 0
            )

            # Array-style weights are per-row weights for X. They must go
            # through the same split as X/y, otherwise their length no longer
            # matches X_train after an internal validation split.
            row_weights = None
            if (has_weights and not isinstance(class_weights, dict)
                    and hasattr(class_weights, 'size') and class_weights.size > 0):
                row_weights = class_weights
                if len(row_weights) != len(y):
                    raise ValueError(
                        f"per-sample weights have {len(row_weights)} entries "
                        f"but there are {len(y)} training samples"
                    )

            # Use external val set if provided, else split internally
            if X_val is None or y_val is None:
                logger.info(f"{tag} No external val set, doing internal split")
                logger.info(f"{tag} X.shape={X.shape}, y.shape={y.shape}, y.dtype={y.dtype}")
                to_split = [X, y] if row_weights is None else [X, y, row_weights]
                ratio = default_settings.validation_ratio
                # Try stratified split, fallback to regular split if fails
                try:
                    parts = train_test_split(
                        *to_split, test_size=ratio, random_state=42, stratify=y
                    )
                    logger.info(f"{tag} Stratified split OK")
                except ValueError as e:
                    logger.warning(f"{tag} Stratified failed: {e}, using regular split")
                    # Stratify fails with too few samples per class
                    parts = train_test_split(
                        *to_split, test_size=ratio, random_state=42
                    )
                X_train, X_val, y_train, y_val = parts[:4]
                if row_weights is not None:
                    # parts[4] is the training share of the weights; the
                    # validation share (parts[5]) is not needed.
                    row_weights = parts[4]
            else:
                X_train, y_train = X, y
                y_val = np.asarray(y_val).ravel()

            # Log shapes after split
            logger.info(f"{tag} After split: X_train={X_train.shape}, X_val={X_val.shape}, y_train={y_train.shape}, y_val={y_val.shape}")

            # Apply class weights if provided
            sample_weights = None
            if has_weights:
                if isinstance(class_weights, dict):
                    sample_weights = np.array([class_weights.get(int(label), 1.0) for label in y_train])
                else:
                    sample_weights = row_weights
                logger.info(f"{tag} sample_weights.shape={sample_weights.shape if sample_weights is not None else 'None'}")

            # Fit a local candidate first: a failed fit must not replace a
            # previously trained/loaded self.model with an unfitted one.
            model = xgb.XGBClassifier(**self.params)
            logger.info(f"{tag} Calling model.fit()...")

            try:
                # NOTE: Removed eval_set to avoid [75, 225] error with softprob
                # The error was caused by some internal XGBoost issue with eval_set
                model.fit(
                    X_train, y_train,
                    sample_weight=sample_weights,
                    verbose=False
                )
            except Exception as fit_err:
                # The outer handler logs the traceback; only tag the stage here.
                logger.error(f"{tag} model.fit failed: {fit_err}")
                raise

            # Evaluate
            y_pred = model.predict(X_val)
            # USE BALANCED ACCURACY to avoid cheating with Majority Class (Neutral)
            accuracy = float(balanced_accuracy_score(y_val.ravel(), y_pred.ravel()))

            # Commit the new state only after fit and evaluation succeeded.
            self.model = model
            self.accuracy = accuracy
            self.is_trained = True

            logger.info(f"{tag} ✓ Balanced Accuracy: {self.accuracy:.2%}")

            # Save model
            self.save()

            return self.accuracy

        except Exception as e:
            logger.error(f"XGBoost training failed: {e}", exc_info=True)
            return 0.0

    

    # [docs]
    def predict(self, X: np.ndarray) -> Tuple[str, float]:
        """Predict a signal and its confidence for the first row of ``X``.

        Args:
            X: Feature input. An object with ``.columns`` (e.g. a DataFrame)
                is reordered/reduced to the expected feature names when those
                are known; a 1-D array is treated as a single sample.

        Returns:
            ``(signal, confidence)`` where ``signal`` comes from
            ``self.signal_from_prediction`` and ``confidence`` is the
            predicted probability of the chosen class. ``("NEUTRAL", 0.0)``
            is returned when no model is available, when expected features
            are missing from ``X``, or when prediction raises.
        """
        if not self.is_trained and not self.load():
            logger.warning("XGBoost model not trained")
            return "NEUTRAL", 0.0

        try:
            # Handle Feature Mismatch (Train vs Predict)
            # 1. Try to use saved feature_names (New System)
            # Normalised to a plain list: a numpy array here would make the
            # truthiness checks below raise "truth value ... is ambiguous".
            saved_names = getattr(self, 'feature_names', None)
            expected_features = list(saved_names) if saved_names is not None else []

            # 2. Fallback: Try to get from underlying model (Legacy System)
            if not expected_features and hasattr(self.model, 'feature_names_in_'):
                expected_features = list(self.model.feature_names_in_)

            # Use the determined expected features to align input
            if hasattr(X, 'columns') and expected_features:
                # Check if we have all required features
                missing = [f for f in expected_features if f not in X.columns]
                if missing:
                    logger.warning(f"XGBoost mismatch: Missing {len(missing)} features ({missing[:3]}...). Returning NEUTRAL.")
                    return "NEUTRAL", 0.0

                # Check for "Unnamed" features vs Named features mismatch (Legacy Check)
                if str(expected_features[0]).startswith("Column_") and not str(X.columns[0]).startswith("Column_"):
                    logger.warning("XGBoost schema mismatch: Model expects raw features (Column_X) but got named features. Returning NEUTRAL.")
                    return "NEUTRAL", 0.0

                # Select only expected columns in correct order
                X = X[expected_features]

            # Ensure 2D input
            if hasattr(X, 'values'):
                X = X.values

            if X.ndim == 1:
                X = X.reshape(1, -1)

            # Get probabilities; only the first row is turned into a signal
            proba = self.model.predict_proba(X)
            pred_class = int(np.argmax(proba, axis=1)[0])
            confidence = proba[0][pred_class]

            signal = self.signal_from_prediction(pred_class)

            return signal, float(confidence)

        except Exception as e:
            logger.error(f"XGBoost prediction failed: {e}")
            return "NEUTRAL", 0.0

    
    

    # [docs]
    def load(self) -> bool:
        """Load the model and its metadata from the pickle at ``self.model_path``.

        The pickle is expected to hold a dict with a required ``'model'`` key
        and optional ``'accuracy'``, ``'params'`` and ``'feature_names'`` keys.

        Returns:
            True when the file was read and the instance state restored;
            False when the file does not exist or reading it fails (the
            failure is logged and the instance is left unchanged).
        """
        if not self.model_path.exists():
            return False

        try:
            # SECURITY: pickle.load can execute arbitrary code embedded in the
            # file. Only load model files written by this application.
            with open(self.model_path, 'rb') as f:
                data = pickle.load(f)

            # Read everything into locals first so that a malformed file
            # cannot leave the instance half-updated.
            model = data['model']
            accuracy = data.get('accuracy', 0.0)
            params = data.get('params', getattr(self, 'params', {}))
            stored_names = data.get('feature_names')
            # Always a plain list, so len() and truthiness checks are safe
            # even if the file stored None or a numpy array.
            feature_names = list(stored_names) if stored_names is not None else []
        except Exception as e:
            logger.error(f"Failed to load XGBoost model: {e}")
            return False

        self.model = model
        self.accuracy = accuracy
        self.params = params
        self.feature_names = feature_names
        self.is_trained = True

        logger.info(f"XGBoost model loaded from {self.model_path} ({len(self.feature_names)} features)")
        return True