-import numpy as np
-from sklearn.base import BaseEstimator, ClassifierMixin
-from sklearn.metrics import pairwise_distances
-from sklearn.utils import check_X_y, check_array
-
-class SemiBoost(BaseEstimator, ClassifierMixin):
-    def __init__(self, base_estimator=None, n_neighbors=5, n_estimators=100, max_iter=50, learning_rate=1.0, random_state=42):
-        self.base_estimator = base_estimator
-        self.n_neighbors = n_neighbors
-        self.n_estimators = n_estimators
-        self.max_iter = max_iter
-        self.learning_rate = learning_rate
-        self.random_state = random_state
-
-    def train(self, X, y):
-        X, y = check_X_y(X, y)
-
-        self.classes_ = np.unique(y)
-        self.X_ = X
-        self.y_ = y
-
-        # Initialize weights
-        self.weights_ = np.ones(len(y)) / len(y)
-
-        for iteration in range(self.max_iter):
-            # Compute pairwise distances
-            distances = pairwise_distances(X)
-            np.fill_diagonal(distances, np.inf)
-            np.nan_to_num(distances, posinf=1e10, neginf=-1e10, copy=False)
-
-            # Find nearest neighbors
-            neighbors = np.argsort(distances, axis=1)[:, :self.n_neighbors]
-
-            # Compute similarity matrix
-            S = np.exp(-distances ** 2 / (2. * np.var(distances)))
-
-            # Update weights
-            for i in range(len(y)):
-                for j in neighbors[i]:
-                    if y[i] == y[j]:
-                        self.weights_[i] *= np.exp(-self.learning_rate * S[i, j])
-                    else:
-                        self.weights_[i] *= np.exp(self.learning_rate * S[i, j])
-
-            # Normalize weights
-            self.weights_ /= np.sum(self.weights_)
-
-        return self
-
-    def predict(self, X):
-        X = check_array(X)
-        distances = pairwise_distances(X, self.X_)
-        predictions = []
-
-        for i in range(len(X)):
-            weighted_votes = np.zeros(len(self.classes_))
-            for j in range(len(self.X_)):
-                weighted_votes[self.y_[j]] += self.weights_[j] / distances[i, j]
-            predictions.append(self.classes_[np.argmax(weighted_votes)])
-
-        return np.array(predictions)
-
-    def predict_proba(self, X):
-        X = check_array(X)
-        distances = pairwise_distances(X, self.X_)
-        proba = []
-
-        for i in range(len(X)):
-            weighted_votes = np.zeros(len(self.classes_))
-            for j in range(len(self.X_)):
-                weighted_votes[self.y_[j]] += self.weights_[j] / distances[i, j]
-            proba.append(weighted_votes / np.sum(weighted_votes))
-
-        return np.array(proba)
+from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
+from sklearn.semi_supervised import SelfTrainingClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, roc_curve
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pickle
+import os
+from joblib import dump
+
+def load_split_data_pickle(directory='split_data_pickle'):
+    """
+    Loads the split and processed data saved as pickle files.
+    """
+    filenames = ['X_train.pkl', 'X_test.pkl', 'y_train.pkl', 'y_test.pkl', 'scaler.pkl']
+    loaded_data = {}
+    for filename in filenames:
+        file_path = os.path.join(directory, filename)
+        if os.path.exists(file_path):
+            with open(file_path, 'rb') as file:
+                loaded_data[filename] = pickle.load(file)
+            print(f"Loaded '{filename}' from '{directory}/'")
+        else:
+            raise FileNotFoundError(f"'{filename}' not found in '{directory}/'")
+    return loaded_data['X_train.pkl'], loaded_data['X_test.pkl'], loaded_data['y_train.pkl'], loaded_data['y_test.pkl'], \
+        loaded_data['scaler.pkl']
+
+
+def save_model(model, directory='models', filename='semi_boost_model.joblib'):
+    """
+    Saves the trained model using joblib.
+    """
+    os.makedirs(directory, exist_ok=True)
+    model_path = os.path.join(directory, filename)
+    dump(model, model_path)
+    print(f"Model saved to '{model_path}'")
+
+
+def build_semi_boost_model(X_train, X_test, y_train, y_test):
+    """
+    Builds and evaluates a SemiBoost-like model using AdaBoost with self-training.
+    """
+    # Initialize base estimator as a weak learner
+    base_estimator = DecisionTreeClassifier(max_depth=1, random_state=42)
+
+    # Initialize AdaBoost Classifier with the weak learner
+    ada_clf = AdaBoostClassifier(estimator=base_estimator, n_estimators=50, random_state=42)
+
+    # Initialize Self-Training Classifier with AdaBoost as base
+    self_training_clf = SelfTrainingClassifier(base_estimator=ada_clf, threshold=0.8, max_iter=10, verbose=True)
+
+    # Fit the model
+    print("\nTraining SemiBoost-like Classifier...")
+    self_training_clf.fit(X_train, y_train)
+
+    # Save the model
+    save_model(self_training_clf, directory='models', filename='semi_boost_model.joblib')
+
+    # Predictions
+    y_pred = self_training_clf.predict(X_test)
+    y_pred_proba = self_training_clf.predict_proba(X_test)[:, 1]
+
+    # Evaluation metrics
+    accuracy = accuracy_score(y_test, y_pred)
+    auc = roc_auc_score(y_test, y_pred_proba)
+    print(f"SemiBoost Test Accuracy: {accuracy:.4f}")
+    print(f"SemiBoost AUC-ROC Score: {auc:.4f}")
+    print("\nClassification Report:")
+    print(classification_report(y_test, y_pred))
+
+    # Confusion matrix
+    conf_matrix = confusion_matrix(y_test, y_pred)
+    plt.figure(figsize=(6, 4))
+    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Greens',
+                xticklabels=['Class 0', 'Class 1'],
+                yticklabels=['Class 0', 'Class 1'])
+    plt.ylabel('Actual')
+    plt.xlabel('Predicted')
+    plt.title('SemiBoost Confusion Matrix')
+    plt.show()
+
+    # ROC curve
+    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
+    plt.figure(figsize=(6, 4))
+    plt.plot(fpr, tpr, color='darkgreen', lw=2, label=f'ROC curve (AUC = {auc:.2f})')
+    plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
+    plt.xlabel('False Positive Rate')
+    plt.ylabel('True Positive Rate')
+    plt.title('SemiBoost ROC Curve')
+    plt.legend(loc="lower right")
+    plt.show()
+
+
+def main():
+    # Load data
+    X_train, X_test, y_train, y_test, scaler = load_split_data_pickle()
+
+    # For the SemiBoost-like approach, treat a fraction of the training labels as unlabeled
+    import numpy as np
+    unlabeled_fraction = 0.1
+    n_unlabeled = int(len(y_train) * unlabeled_fraction)
+    np.random.seed(42)
+    unlabeled_indices = np.random.choice(y_train.index, n_unlabeled, replace=False)
+    y_train_unlabeled = y_train.copy()
+    y_train_unlabeled.loc[unlabeled_indices] = -1  # Mark as unlabeled
+
+    # Build and evaluate the SemiBoost-like model
+    build_semi_boost_model(X_train, X_test, y_train_unlabeled, y_test)
+
+
+if __name__ == "__main__":
+    main()
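
A minimal follow-up sketch of how the saved artifact might be reloaded for inference, assuming the default locations used above ('split_data_pickle/' for the pickled splits and 'models/semi_boost_model.joblib' from save_model()) and a binary target:

    import pickle
    from joblib import load
    from sklearn.metrics import accuracy_score, roc_auc_score

    # Assumed default paths, matching load_split_data_pickle() and save_model() above.
    with open('split_data_pickle/X_test.pkl', 'rb') as f:
        X_test = pickle.load(f)
    with open('split_data_pickle/y_test.pkl', 'rb') as f:
        y_test = pickle.load(f)

    # Reload the trained SelfTrainingClassifier and score the held-out test set.
    clf = load('models/semi_boost_model.joblib')
    print("Reloaded accuracy:", accuracy_score(y_test, clf.predict(X_test)))
    print("Reloaded AUC-ROC:", roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))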