Skip to content

Commit 197f521

Browse files
authored
Add files via upload
1 parent 12c507d commit 197f521

File tree

1 file changed

+112
-74
lines changed

1 file changed

+112
-74
lines changed
Lines changed: 112 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,74 +1,112 @@
1-
import numpy as np;
2-
from sklearn.base import BaseEstimator, ClassifierMixin;
3-
from sklearn.metrics import pairwise_distances;
4-
from sklearn.utils import check_X_y, check_array;
5-
6-
class SemiBoost(BaseEstimator, ClassifierMixin):
    """Instance-weighted, distance-based classifier.

    ``train`` learns one weight per training sample from the labels of its
    nearest neighbours; ``predict`` and ``predict_proba`` score every class
    by summing ``weight / distance`` over the training samples that carry
    that class label.

    Parameters
    ----------
    base_estimator, n_estimators, random_state
        Stored on the instance; no method of this class reads them.
    n_neighbors : int
        Number of nearest training samples inspected per sample in ``train``.
    max_iter : int
        Number of multiplicative weight updates performed by ``train``.
    learning_rate : float
        Scale applied to the similarity inside the exponential update.
    """

    def __init__(self, base_estimator=None, n_neighbors=5, n_estimators=100,
                 max_iter=50, learning_rate=1.0, random_state=42):
        self.base_estimator = base_estimator
        self.n_neighbors = n_neighbors
        self.n_estimators = n_estimators
        self.max_iter = max_iter
        self.learning_rate = learning_rate
        self.random_state = random_state

    def train(self, X, y):
        """Compute per-sample weights from neighbourhood label agreement.

        A sample's weight shrinks for every neighbour that shares its label
        and grows for every neighbour that does not; weights are renormalised
        to sum to one after each of the ``max_iter`` rounds.

        Sets ``classes_``, ``X_``, ``y_`` and ``weights_`` and returns
        ``self``.
        """
        X, y = check_X_y(X, y)

        self.classes_ = np.unique(y)
        self.X_ = X
        self.y_ = y

        n_samples = len(y)
        self.weights_ = np.ones(n_samples) / n_samples

        # X and y never change between rounds, so the distance matrix, the
        # neighbour lists and the similarities are computed once up front.
        distances = pairwise_distances(X)
        # Push each sample to the end of its own neighbour ranking.
        np.fill_diagonal(distances, np.inf)
        np.nan_to_num(distances, posinf=1e10, neginf=-1e10, copy=False)

        neighbors = np.argsort(distances, axis=1)[:, :self.n_neighbors]

        # NOTE(review): the variance is taken over the whole matrix, including
        # the 1e10 diagonal sentinel, so the bandwidth is very large and the
        # similarity is close to 1 for every pair. Kept as-is to preserve the
        # numerical behaviour - confirm whether this is intended.
        similarity = np.exp(-distances ** 2 / (2. * np.var(distances)))

        row_index = np.arange(n_samples)[:, np.newaxis]
        neighbor_similarity = similarity[row_index, neighbors]
        same_label = y[neighbors] == y[:, np.newaxis]

        # Multiplying by exp(-lr * s) for agreeing neighbours and exp(+lr * s)
        # for disagreeing ones equals one multiplication by exp(lr * signed sum).
        signed_similarity = np.where(same_label,
                                     -neighbor_similarity,
                                     neighbor_similarity)
        round_factor = np.exp(self.learning_rate * signed_similarity.sum(axis=1))

        for _ in range(self.max_iter):
            self.weights_ *= round_factor
            self.weights_ /= np.sum(self.weights_)

        return self

    def _class_votes(self, X):
        """Return an array of shape (len(X), len(classes_)) of class scores."""
        X = check_array(X)
        distances = pairwise_distances(X, self.X_)
        # A query identical to a training sample has distance 0; clamp so the
        # inverse-distance vote stays finite instead of producing inf / nan.
        distances = np.maximum(distances, 1e-12)

        # Map each training label to its column in classes_ rather than using
        # the raw label as an index, which only works for labels 0..k-1.
        class_index = np.searchsorted(self.classes_, self.y_)
        membership = np.zeros((len(self.y_), len(self.classes_)))
        membership[np.arange(len(self.y_)), class_index] = 1.0

        return np.dot(self.weights_ / distances, membership)

    def predict(self, X):
        """Return, for each row of X, the class with the largest score."""
        votes = self._class_votes(X)
        return self.classes_[np.argmax(votes, axis=1)]

    def predict_proba(self, X):
        """Return class scores for each row of X, normalised to sum to one.

        Columns follow the order of ``classes_``.
        """
        votes = self._class_votes(X)
        return votes / votes.sum(axis=1, keepdims=True)
1+
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
2+
from sklearn.semi_supervised import SelfTrainingClassifier
3+
from sklearn.tree import DecisionTreeClassifier
4+
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, roc_curve
5+
import matplotlib.pyplot as plt
6+
import seaborn as sns
7+
import pickle
8+
import os
9+
from joblib import dump
10+
11+
def load_split_data_pickle(directory='split_data_pickle'):
    """
    Read the pickled train/test split and scaler from *directory*.

    The files are read in the order X_train, X_test, y_train, y_test,
    scaler, and a confirmation line is printed after each successful load.
    Returns the five unpickled objects as a tuple in that same order.

    Raises FileNotFoundError as soon as one of the expected files is
    missing. Unpickling executes code embedded in the file, so point this
    only at files you produced yourself.
    """
    def _read(name):
        # Load one pickle from the directory, failing loudly when absent.
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            raise FileNotFoundError(f"'{name}' not found in '{directory}/'")
        with open(path, 'rb') as handle:
            obj = pickle.load(handle)
        print(f"Loaded '{name}' from '{directory}/'")
        return obj

    X_train = _read('X_train.pkl')
    X_test = _read('X_test.pkl')
    y_train = _read('y_train.pkl')
    y_test = _read('y_test.pkl')
    scaler = _read('scaler.pkl')
    return X_train, X_test, y_train, y_test, scaler
27+
28+
29+
def save_model(model, directory='models', filename='semi_boost_model.joblib'):
    """
    Persist *model* with Joblib at ``directory/filename``.

    The target directory is created when it does not exist yet, and the
    destination path is printed once the model has been written.
    """
    destination = os.path.join(directory, filename)
    os.makedirs(directory, exist_ok=True)
    dump(model, destination)
    print(f"Model saved to '{destination}'")
37+
38+
39+
def build_semi_boost_model(X_train, X_test, y_train, y_test):
    """
    Train, save and evaluate a SemiBoost-like model.

    The model is a SelfTrainingClassifier wrapped around AdaBoost with
    depth-1 decision trees as weak learners. The label -1 in ``y_train``
    marks an unlabeled sample for the self-training wrapper.

    Steps, in order: fit on the training data, save the fitted model to
    ``models/semi_boost_model.joblib``, print accuracy, AUC-ROC and the
    classification report for the test split, then show the confusion
    matrix and the ROC curve.

    The evaluation assumes a binary problem: column 1 of ``predict_proba``
    is used as the positive-class score and the plots are labelled
    'Class 0' / 'Class 1'.
    """
    # Weak learner: a decision stump.
    weak_learner = DecisionTreeClassifier(max_depth=1, random_state=42)

    # The wrapped estimator is passed positionally on purpose: its keyword was
    # renamed from `base_estimator` to `estimator` across scikit-learn
    # releases, while the first positional slot works on both sides.
    ada_clf = AdaBoostClassifier(weak_learner, n_estimators=50, random_state=42)
    self_training_clf = SelfTrainingClassifier(ada_clf, threshold=0.8, max_iter=10, verbose=True)

    print("\nTraining SemiBoost-like Classifier...")
    self_training_clf.fit(X_train, y_train)

    save_model(self_training_clf, directory='models', filename='semi_boost_model.joblib')

    y_pred = self_training_clf.predict(X_test)
    # Score of the second class in classes_, used as the positive class.
    y_pred_proba = self_training_clf.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    print(f"SemiBoost Test Accuracy: {accuracy:.4f}")
    print(f"SemiBoost AUC-ROC Score: {auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    _plot_confusion_matrix(y_test, y_pred)
    _plot_roc_curve(y_test, y_pred_proba, auc)


def _plot_confusion_matrix(y_test, y_pred):
    """Show the test-set confusion matrix as an annotated heatmap."""
    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Greens',
                xticklabels=['Class 0', 'Class 1'],
                yticklabels=['Class 0', 'Class 1'])
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('SemiBoost Confusion Matrix')
    plt.show()


def _plot_roc_curve(y_test, y_pred_proba, auc):
    """Show the ROC curve with the chance diagonal and the AUC in the legend."""
    fpr, tpr, _thresholds = roc_curve(y_test, y_pred_proba)
    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, color='darkgreen', lw=2, label=f'ROC curve (AUC = {auc:.2f})')
    plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('SemiBoost ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
92+
93+
94+
def main():
    """
    Load the pickled split, hide 10% of the training labels, and run the model.

    The hidden labels are replaced by -1 (the unlabeled marker) in a copy of
    the training labels; the loaded object itself is left untouched. The
    selection is seeded with 42 so the same rows are hidden on every run.
    """
    import numpy as np

    # The scaler is part of the saved split but is not needed here.
    X_train, X_test, y_train, y_test, _scaler = load_split_data_pickle()

    # For the SemiBoost-like approach, pretend part of the data is unlabeled.
    unlabeled_fraction = 0.1
    n_unlabeled = int(len(y_train) * unlabeled_fraction)

    # Sample row positions rather than index labels: this also works when the
    # labels are a plain array and when a pandas index holds duplicate labels,
    # where label-based assignment would hide more rows than requested.
    np.random.seed(42)
    unlabeled_positions = np.random.choice(len(y_train), n_unlabeled, replace=False)

    if hasattr(y_train, 'iloc'):
        y_train_unlabeled = y_train.copy()
        y_train_unlabeled.iloc[unlabeled_positions] = -1  # Mark as unlabeled
    else:
        y_train_unlabeled = np.array(y_train)  # np.array copies its input
        y_train_unlabeled[unlabeled_positions] = -1  # Mark as unlabeled

    # Build and evaluate the SemiBoost-like model
    build_semi_boost_model(X_train, X_test, y_train_unlabeled, y_test)


# Run the pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)