# 6-5 PSO Fuzzy XGB NGN Features.py
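"""PSO-tuned fuzzy XGBoost on Neural Gas-selected features.

Pipeline: standardize the features, rank them with a Neural Gas network (NGN),
fuzzify the top-ranked features into three levels, then tune XGBoost's
max_depth and learning_rate with particle swarm optimization (PSO).
"""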
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import matplotlib.pyplot as plt
from pyswarm import pso
class NeuralGas:
    def __init__(self, n_units=3, max_iter=1000, eta_start=0.5, eta_end=0.01, lambda_start=30, lambda_end=0.1):
        self.n_units = n_units
        self.max_iter = max_iter
        self.eta_start = eta_start
        self.eta_end = eta_end
        self.lambda_start = lambda_start
        self.lambda_end = lambda_end
        self.units = None
        self.feature_importance = None

    def _update_learning_rate(self, i):
        return self.eta_start * (self.eta_end / self.eta_start) ** (i / self.max_iter)

    def _update_neighborhood_range(self, i):
        return self.lambda_start * (self.lambda_end / self.lambda_start) ** (i / self.max_iter)
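    # Both schedules interpolate exponentially between their endpoints:
    # value(i) = start * (end / start) ** (i / max_iter), so the learning rate
    # decays from eta_start to eta_end and the neighborhood range from
    # lambda_start to lambda_end over max_iter iterations.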
    def train(self, data):
        n_samples, n_features = data.shape
        self.units = np.random.rand(self.n_units, n_features)
        self.feature_importance = np.zeros(n_features)
        for i in range(self.max_iter):
            eta = self._update_learning_rate(i)
            lambd = self._update_neighborhood_range(i)
            indices = np.random.permutation(n_samples)
            for index in indices:
                x = data[index]
                # Rank all units by distance to the sample; closer units adapt more
                dists = np.linalg.norm(self.units - x, axis=1)
                ranking = np.argsort(dists)
                for rank, idx in enumerate(ranking):
                    influence = np.exp(-rank / lambd)
                    delta = eta * influence * (x - self.units[idx])
                    self.units[idx] += delta
                    # Accumulate per-feature adaptation magnitude as the
                    # importance signal (one plausible heuristic; neural gas
                    # has no canonical feature-importance measure). Without
                    # this, get_feature_importance would divide zero by zero.
                    self.feature_importance += np.abs(delta)
            print(f"NGN Training Iteration: {i+1}/{self.max_iter}")
    def get_feature_importance(self):
        return self.feature_importance / np.sum(self.feature_importance)
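# A minimal sanity check for the class above (hypothetical data, not part of
# the pipeline): train the gas on three synthetic clusters and confirm the
# units settle near the cluster centres.
#
#   demo = np.vstack([np.random.randn(50, 2) + c for c in ([0, 0], [5, 5], [-5, 5])])
#   gas = NeuralGas(n_units=3, max_iter=50)
#   gas.train(demo)
#   print(gas.units)  # each row should lie near one cluster centre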
# Load data; the last column holds 1-based class labels
data_df = pd.read_csv('tr.csv')
features = data_df.iloc[:, :-1]
labels = data_df.iloc[:, -1] - 1  # shift to 0-based labels for XGBoost
# Normalize features
scaler = StandardScaler()
features_normalized = scaler.fit_transform(features)
features_normalized_df = pd.DataFrame(features_normalized, columns=features.columns)
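# Z-scoring matters here: the neural gas ranks units by Euclidean distance,
# which would otherwise be dominated by features with large raw scales.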
# Initialize and train Neural Gas
ng = NeuralGas(n_units=9, max_iter=100)
ng.train(features_normalized)
# Get feature importance and select top features
feature_importance = ng.get_feature_importance()
top_features_indices = np.argsort(feature_importance)[-22:]  # keep the 22 highest-ranked features
features_top = features_normalized_df.iloc[:, top_features_indices]
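# For inspection: which columns survived the NGN ranking
print("Selected features:", list(features_top.columns))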
# Fuzzify selected features: map each feature onto three levels (0/1/2) using
# its 33rd and 67th percentiles as thresholds
def fuzzify_features(features_df):
    for column in features_df.columns:
        low = features_df[column].quantile(0.33)
        high = features_df[column].quantile(0.67)
        features_df[column] = features_df[column].apply(lambda x: 0 if x < low else (1 if x < high else 2))
    return features_df
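# An equivalent vectorized form (up to boundary handling at the thresholds):
#   features_df[column] = pd.cut(features_df[column],
#                                bins=[-np.inf, low, high, np.inf],
#                                labels=[0, 1, 2]).astype(int)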
features_fuzzified_df = fuzzify_features(features_top.copy())
# Split data
X_train, X_test, y_train, y_test = train_test_split(features_fuzzified_df, labels, test_size=0.3, random_state=42)
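# Note: if the classes are imbalanced, passing stratify=labels to
# train_test_split keeps class proportions consistent across both splits.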
# Running minimum of the loss, appended once per objective evaluation
# (pyswarm calls the objective swarmsize times per PSO iteration)
best_cost_history = []
# PSO objective for XGBoost: PSO minimizes, so return 1 - accuracy
def xgb_objective(params):
    global best_accuracy
    max_depth, learning_rate = int(params[0]), params[1]
    # The sklearn wrapper infers the number of classes from y, so num_class is
    # not passed explicitly; random_state replaces the deprecated seed alias
    model = xgb.XGBClassifier(max_depth=max_depth, learning_rate=learning_rate,
                              objective='multi:softmax', verbosity=1, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    loss = 1 - accuracy
    if best_cost_history:
        # Keep the history monotone: record the minimum loss seen so far
        best_cost_history.append(min(loss, best_cost_history[-1]))
    else:
        # First evaluation initializes the history
        best_cost_history.append(loss)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        print(f"New best parameters: max_depth={max_depth}, learning_rate={learning_rate}, accuracy={accuracy:.4f}")
    return loss
# Initialize the best accuracy
best_accuracy = 0
# PSO search bounds: [max_depth, learning_rate]
lb = [3, 0.01]
ub = [10, 0.3]
# Perform PSO
xopt, fopt = pso(xgb_objective, lb, ub, swarmsize=7, maxiter=100)
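# pso returns the best parameter vector and its objective value
print(f"PSO best: max_depth={int(xopt[0])}, learning_rate={xopt[1]:.4f}, loss={fopt:.4f}")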
# Retrain with the optimized parameters (same classifier settings as in the objective)
model_opt = xgb.XGBClassifier(max_depth=int(xopt[0]), learning_rate=xopt[1],
                              objective='multi:softmax', verbosity=1, random_state=42)
model_opt.fit(X_train, y_train)
y_pred_opt = model_opt.predict(X_test)
# Evaluate and print optimized accuracy
accuracy_opt = accuracy_score(y_test, y_pred_opt)
print(f'Optimized Test Accuracy: {accuracy_opt:.4f}')
# Plot the running best cost (one point per objective evaluation)
plt.figure(figsize=(10, 6))
plt.plot(best_cost_history, label='Best Cost So Far', color='blue', linewidth=2)
plt.title('PSO Best Cost Over Objective Evaluations', fontweight='bold', fontsize=14)
plt.xlabel('Objective Evaluation', fontweight='bold', fontsize=14)
plt.ylabel('Loss', fontweight='bold', fontsize=14)
plt.xticks(fontweight='bold', fontsize=14)
plt.yticks(fontweight='bold', fontsize=14)
plt.legend()
plt.grid(True)
plt.show()
import seaborn as sns
from sklearn.metrics import confusion_matrix
# Compute the confusion matrix for the optimized predictions
cm = confusion_matrix(y_test, y_pred_opt)
cm_percentage = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
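# Rows are normalized by true-class counts, so each row sums to 1 and the
# diagonal entries are the per-class recall.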
# Plotting the confusion matrix by percentage
plt.figure(figsize=(7, 5))
sns.heatmap(cm_percentage, annot=True, fmt=".2%", cmap='Blues', annot_kws={"size": 12, "weight": "bold"})
plt.title('Confusion Matrix by Percentage - Optimized Model', size=14, weight='bold')
plt.ylabel('Actual Label', size=12, weight='bold')
plt.xlabel('Predicted Label', size=12, weight='bold')
plt.xticks(fontsize=10, fontweight='bold')
plt.yticks(fontsize=10, fontweight='bold')
plt.show()
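# Optional follow-up: per-class precision/recall/F1 for the optimized model
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_opt))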