library(ggplot2)
data = read.table('bank2.dat')
colnames(data) = c('X1','X2','X3','X4','X5','X6')
data = data.frame(data, genuine = c(rep(1, 100), rep(0, 100)))
attach(data)
下列物件被遮斷自 data (pos = 6):
genuine, X1, X2, X3, X4, X5, X6
下列物件被遮斷自 data (pos = 8):
genuine, X1, X2, X3, X4, X5, X6
下列物件被遮斷自 data (pos = 9):
genuine, X1, X2, X3, X4, X5, X6
下列物件被遮斷自 data (pos = 10):
genuine, X1, X2, X3, X4, X5, X6
下列物件被遮斷自 data (pos = 16):
genuine, X1, X2, X3, X4, X5, X6
head(data)
print('Quartiles:')
[1] "Quartiles:"
summary(data)
X1 X2 X3 X4 X5 X6 genuine
Min. :213.8 Min. :129.0 Min. :129.0 Min. : 7.200 Min. : 7.70 Min. :137.8 Min. :0.0
1st Qu.:214.6 1st Qu.:129.9 1st Qu.:129.7 1st Qu.: 8.200 1st Qu.:10.10 1st Qu.:139.5 1st Qu.:0.0
Median :214.9 Median :130.2 Median :130.0 Median : 9.100 Median :10.60 Median :140.4 Median :0.5
Mean :214.9 Mean :130.1 Mean :130.0 Mean : 9.418 Mean :10.65 Mean :140.5 Mean :0.5
3rd Qu.:215.1 3rd Qu.:130.4 3rd Qu.:130.2 3rd Qu.:10.600 3rd Qu.:11.20 3rd Qu.:141.5 3rd Qu.:1.0
Max. :216.3 Max. :131.0 Max. :131.1 Max. :12.700 Max. :12.30 Max. :142.4 Max. :1.0
print('NAs:')
[1] "NAs:"
sapply(data, function(x) sum(is.na(x)))
X1 X2 X3 X4 X5 X6 genuine
0 0 0 0 0 0 0
library(ggcorrplot)
cor_matrix = cor(data)
ggcorrplot(cor_matrix,
method = "circle",
type = "full",
lab = TRUE,
lab_size = 3,
colors = c("blue", "white", "red"),
outline.color = "gray",
legend.title = "Correlation",
show.legend = TRUE)
pairs(data, upper.panel = NULL)
pairs_plot = function(x, y, x_name, y_name){
par(mar = c(5, 5, 4, 10))
plot(x, y, main = "Swiss bank notes", xlab = x_name, ylab = y_name)
points(x[1:100], y[1:100], col = "blue")
points(x[101:200], y[101:200], col = "red")
legend("topright", legend = c("Genuine", "Counterfeit"), col = c("blue", "red"), pch = 1, xpd = TRUE, inset = c(-0.3, 0))
#ggplot(data.frame(X5, X6), aes(x = X5, y = X6, color = genuine)) +
#geom_point() +
#labs(title = "Swiss bank notes")
}
num = combn(1:6, 2)
for (i in 1:ncol(num)) {
x = paste0("X", num[1, ][i])
y = paste0("X", num[2, ][i])
pairs_plot(get(x), get(y), x, y)
}
NA
NA
name = NULL
for (i in 1:6) {
name = c(name, paste0("X", i, "_Genuine"))
name = c(name, paste0("X", i, "_Counterfeit"))
}
groups = list()
for (i in 1:length(name)){
if (i %% 2 == 1) {
groups[[name[i]]] = data[1:100, as.integer(i / 2) + 1]
} else {
groups[[name[i]]] = data[101:200, as.integer(i / 2)]
}
}
means <- sapply(groups, mean)
par(mfrow=c(1,3))
for (i in seq(1, length(name) ,by = 2)){
boxplot(groups[i:(i + 1)], names = name[i:(i + 1)], frame = TRUE, main = "Swiss bank notes", cex.axis=0.99)
for (j in 0:1) {
lines(c(j + 0.6, j + 1.4), rep(means[[name[(i + j)]]], 2), lty = "dotted", lwd = 1.2)
}
}
data_Genuine = data[1:100, 1:6]
data_Counterfeit = data[101:200, 1:6]
for (i in 1:6) {
breaks_seq = seq(min(data[, i]), max(data[, i]), by = 0.1)
min_x = floor(min(data[, i]))
max_x = ceiling(max(data[, i]))
hist(data_Genuine[, i],
breaks = breaks_seq,
col = rgb(0, 0, 1, 0.5),
border = "blue",
xlim = c(min_x, max_x),
ylim = c(0, max(table(cut(data[, i], breaks_seq))) + 5),
main = "Histogram",
xlab = paste0("X", i),
axes = FALSE)
hist(data_Counterfeit[, i],
breaks = breaks_seq,
col = rgb(1, 0, 0, 0.5),
border = "red",
add = TRUE)
axis(side = 1, at = seq(min_x, max_x, by = 1))
axis(side = 2)
legend("topright",
legend = c("Genuine", "Counterfeit", "Overlap"),
fill = c(rgb(0, 0, 1, 0.5), rgb(1, 0, 0, 0.5), rgb(0.6, 0, 0.4, 0.7)),
border = c("blue", "red", "purple"))
}
data_Genuine = data[data$genuine == 1, 1:6]
data_Counterfeit = data[data$genuine == 0, 1:6]
overlap_ratios = numeric(6)
par(mfrow = c(1, 1))
num_vars = ncol(data_Genuine)
for (i in 1:num_vars) {
breaks_seq = seq(min(data[, i]), max(data[, i]), by = 0.1)
min_x = floor(min(data[, i]))
max_x = ceiling(max(data[, i]))
hist_Genuine = hist(data_Genuine[, i], breaks = breaks_seq, plot = FALSE)
hist_Counterfeit = hist(data_Counterfeit[, i], breaks = breaks_seq, plot = FALSE)
overlap_counts = pmin(hist_Genuine$counts, hist_Counterfeit$counts)
overlap_ratio = sum(overlap_counts) / sum(hist_Counterfeit$counts)
overlap_ratios[i] = overlap_ratio
plot_title = paste0("Histogram of ", colnames(data)[i], " (Overlap: ", round(overlap_ratio * 100, 2), "%)")
hist(data_Genuine[, i],
breaks = breaks_seq,
col = rgb(0, 0, 1, 0.5),
border = "blue",
xlim = c(min_x, max_x),
ylim = c(0, max(c(hist_Genuine$counts, hist_Counterfeit$counts)) + 5),
main = plot_title,
xlab = colnames(data)[i],
axes = FALSE)
hist(data_Counterfeit[, i],
breaks = breaks_seq,
col = rgb(1, 0, 0, 0.5),
border = "red",
add = TRUE)
axis(side = 1, at = seq(min_x, max_x, by = 1))
axis(side = 2)
legend("topright",
legend = c("Genuine", "Counterfeit", "Overlap"),
fill = c(rgb(0, 0, 1, 0.5), rgb(1, 0, 0, 0.5), rgb(0.6, 0, 0.4, 0.7)),
border = c("blue", "red", "purple"))
}
library(KernSmooth)
for (i in 1:6) {
data_Genuine = data[1:100, i]
data_Counterfeit = data[101:200, i]
fh1 = bkde(data_Genuine, kernel = "biweight")
fh2 = bkde(data_Counterfeit, kernel = "biweight")
x_min = floor(min(c(fh1$x, fh2$x)))
x_max = ceiling(max(c(fh1$x, fh2$x)))
y_max = max(c(fh1$y, fh2$y)) * 1.1
x_ticks <- seq(x_min, x_max, by = 1)
plot(fh1, type = "l", lwd = 2,
xlab = "Counterfeit / Genuine",
ylab = paste0("Density estimates for X", i, sep = ""),
col = "blue", main = paste0("Swiss bank notes - X", i),
xlim = c(x_min, x_max), ylim = c(0, y_max),
axes = FALSE)
lines(fh2, lty = "dotted", lwd = 2, col = "red")
axis(side = 1, at = x_ticks, labels = x_ticks)
axis(side = 2)
box()
legend("topright",
legend = c("Genuine", "Counterfeit"),
col = c("blue", "red"),
lty = c("solid", "dotted"),
lwd = 2,
cex = 0.8)
}
library(lattice)
library(grid)
groups = data[, 7]
pch_types = c(1, 1)
colors = c("blue", "red")
pch_group = pch_types[ifelse(groups == 0, 1, 2)]
col_group = colors[ifelse(groups == 0, 1, 2)]
combos = combn(1:6, 3)
n_rows = 1
n_cols = 1
plots_per_page = n_rows * n_cols
for (page in seq(1, ncol(combos), by = plots_per_page)) {
grid.newpage()
for (i in 0:(plots_per_page - 1)) {
index = page + i
if (index > ncol(combos)) break
x_name = paste0("X", combos[1, index])
y_name = paste0("X", combos[2, index])
z_name = paste0("X", combos[3, index])
x_data = data[[x_name]]
y_data = data[[y_name]]
z_data = data[[z_name]]
p = cloud(z_data ~ x_data * y_data,
pch = pch_group,
col = col_group,
cex = 1.2,
ticktype = "detailed",
main = paste("Swiss bank notes:", x_name, y_name, z_name),
screen = list(z = -90, x = -90, y = 45),
scales = list(arrows = FALSE, col = "black", distance = 1, cex = 0.5),
xlab = list(x_name, rot = -10, cex = 1.2),
ylab = list(y_name, rot = 10, cex = 1.2),
zlab = list(z_name, rot = 90, cex = 1.1),
par.settings = list(
axis.line = list(col = "black"),
box.3d = list(
col = c(
"black", "transparent", "transparent","black", "black", "transparent", "transparent", "transparent",
"transparent", "transparent", "transparent", "transparent"))),
key = list(
space = "right",
points = list(pch = pch_types, col = colors, cex = 1.5),
text = list(c("Genuine", "Counterfeit")),
border = FALSE
)
)
print(p, split = c((i %% n_cols) + 1, (i %/% n_cols) + 1, n_cols, n_rows), more = TRUE)
}
}
library(plotly)
library(htmlwidgets)
# 假設你的 data 是一個 data.frame,包含 X1, X2, X6 和 groups 列
# 取出 X1, X2 和 X6 變數
x_data = data$X1
y_data = data$X2
z_data = data$X6
groups = data[, 7]
# 將 groups 轉換為因子,這樣我們可以用顏色區分
groups = factor(groups, levels = c(0, 1), labels = c("Genuine", "Counterfeit"))
# 計算 x, y, z 軸的範圍
x_range <- range(x_data)
y_range <- range(y_data)
z_range <- range(z_data)
# 將 x, y, z 軸數據縮放到 [0, 1] 範圍
x_scaled = (x_data - x_range[1]) / (x_range[2] - x_range[1])
y_scaled = (y_data - y_range[1]) / (y_range[2] - y_range[1])
z_scaled = (z_data - z_range[1]) / (z_range[2] - z_range[1])
# 計算刻度點,並將其四捨五入到小數點後 1 位
tick_positions = seq(0, 1, length.out = 5)
tick_labels_x = round(seq(x_range[1], x_range[2], length.out = 5), 1)
tick_labels_y = round(seq(y_range[1], y_range[2], length.out = 5), 1)
tick_labels_z = round(seq(z_range[1], z_range[2], length.out = 5), 1)
# 每個點的 hover 標籤,顯示原始數據值
hover_text <- paste0(
"X1: ", round(x_data, 1), "<br>",
"X2: ", round(y_data, 1), "<br>",
"X6: ", round(z_data, 1)
)
# 使用 plotly 生成 3D 散點圖
fig <- plot_ly(
x = x_scaled,
y = y_scaled,
z = z_scaled,
color = groups,
colors = c("blue", "red"),
type = "scatter3d",
mode = "markers",
marker = list(size = 5),
text = hover_text, # 設定 hover 的文字
hoverinfo = "text" # 顯示 text 而不是縮放後的座標
)
# 添加標題和標籤,並設定軸範圍與標籤
fig <- fig %>% layout(
title = "3D Scatter Plot of X1, X2, X6",
scene = list(
xaxis = list(title = "X1", range = c(0, 1), tickvals = tick_positions, ticktext = tick_labels_x),
yaxis = list(title = "X2", range = c(0, 1), tickvals = tick_positions, ticktext = tick_labels_y),
zaxis = list(title = "X6", range = c(0, 1), tickvals = tick_positions, ticktext = tick_labels_z)
)
)
# 輸出為 HTML 檔案
saveWidget(fig, "3d_scatter_plot.html")
library(tourr)
# x:全部 200 筆資料
x = data[1:200, ]
# 初始化 y(做 zero-one scaling)
y = NULL
for (i in 1:6) {
z = (x[, i] - min(x[, i])) / (max(x[, i]) - min(x[, i])) # zero-one scaling
y = cbind(y, z)
}
# 定義類別
Type = data[, 7] # 前 100 筆為 Type 1,後 100 筆為 Type 2
f = as.integer(Type)
# 定義 t 軸範圍
grid = seq(0, 2 * pi, length = 1000)
# 初始化 plot,畫第一筆曲線
plot(grid,
andrews(y[1, ])(grid),
type = "l",
lwd = 1.2,
main = "Andrews' curves (All Bank data)",
axes = FALSE,
frame = TRUE,
ylim = c(-0.5, 0.6),
ylab = "",
xlab = "",
col = ifelse(f[1] == 1, "blue", "red"),
lty = ifelse(f[1] == 1, "solid", "dotted"))
# 繪製剩餘曲線
for (i in 2:200) {
lines(grid,
andrews(y[i, ])(grid),
col = ifelse(f[i] == 1, "blue", "red"),
lwd = 1.2,
lty = ifelse(f[i] == 1, "solid", "dotted"))
}
# 增加座標軸
axis(side = 2, at = seq(-0.5, 0.6, 0.2), labels = seq(-0.5, 0.6, 0.2))
axis(side = 1, at = seq(0, 7, 1), labels = seq(0, 7, 1))
# 加上圖例
legend("topright",
legend = c("Genuine", "Counterfeit"),
col = c("blue", "red"),
lwd = 1.5,
lty = c("solid", "dotted"))
#Logistic Regression
library(caTools)
library(ROCR)
# summary
set.seed(123456)
split = sample.split(data,SplitRatio = 0.7)
train_reg = subset(data, split =='TRUE')
test_reg = subset(data, split == 'FALSE')
logistic_model = glm(factor(genuine) ~ X1+X2+X3+X4+X5+X6, data = train_reg,
family = binomial(link = 'logit'),
control = list(maxit=1000))
警告: glm.fit:擬合機率算出來是數值零或一
logistic_model
Call: glm(formula = factor(genuine) ~ X1 + X2 + X3 + X4 + X5 + X6,
family = binomial(link = "logit"), data = train_reg, control = list(maxit = 1000))
Coefficients:
(Intercept) X1 X2 X3 X4 X5 X6
-5619.203 15.488 4.479 -17.391 -15.337 -22.686 30.986
Degrees of Freedom: 113 Total (i.e. Null); 107 Residual
Null Deviance: 158
Residual Deviance: 4.51e-10 AIC: 14
summary(logistic_model)
Call:
glm(formula = factor(genuine) ~ X1 + X2 + X3 + X4 + X5 + X6,
family = binomial(link = "logit"), data = train_reg, control = list(maxit = 1000))
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.619e+03 2.773e+08 0 1
X1 1.549e+01 2.267e+05 0 1
X2 4.479e+00 8.012e+05 0 1
X3 -1.739e+01 1.287e+06 0 1
X4 -1.534e+01 3.596e+05 0 1
X5 -2.269e+01 1.828e+05 0 1
X6 3.099e+01 1.682e+05 0 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1.5800e+02 on 113 degrees of freedom
Residual deviance: 4.5102e-10 on 107 degrees of freedom
AIC: 14
Number of Fisher Scoring iterations: 27
predict_reg = predict(logistic_model,test_reg,type='response')
predict_reg
1 2 7 8 9 14 15 16
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
21 22 23 28 29 30 35 36
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
37 42 43 44 49 50 51 56
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
57 58 63 64 65 70 71 72
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 8.748549e-11 9.986445e-01 1.000000e+00
77 78 79 84 85 86 91 92
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
93 98 99 100 105 106 107 112
1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
113 114 119 120 121 126 127 128
2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
133 134 135 140 141 142 147 148
2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
149 154 155 156 161 162 163 168
2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
169 170 175 176 177 182 183 184
2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
189 190 191 196 197 198
2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16 2.220446e-16
predict_reg = ifelse(predict_reg >0.5,1,0)
table(test_reg$genuine,predict_reg)
predict_reg
0 1
0 42 0
1 1 43
missing_classerr = mean(predict_reg!=test_reg$genuine)
print(paste('Accuracy=',1-missing_classerr))
[1] "Accuracy= 0.988372093023256"
ROCPred = prediction(predict_reg, test_reg$genuine)
ROCPer = performance(ROCPred, measure = 'tpr',x.measure = 'fpr')
auc = performance(ROCPred,measure='auc')
auc = auc@y.values[[1]]
auc
[1] 0.9886364
plot(ROCPer)
plot(ROCPer,colourise=TRUE,print.cuttoffs.at=seq(0.1,by=0.1), col=2, lwd=2,
main='ROC Curve')
abline(a=0,b=1)
auc = round(auc,4)
legend(.6,.4, auc,title = 'AUC',cex=1)
library(randomForest)
# summary
set.seed(12345)
data$genuine = as.factor(data$genuine)
split = sample.split(data,SplitRatio = 0.6)
train_rf = subset(data, split =='TRUE')
test_rf = subset(data, split == 'FALSE')
rf = randomForest(genuine~ X1+X2+X3+X4+X5+X6,data =train_rf,ntree=500)
print(rf)
Call:
randomForest(formula = genuine ~ X1 + X2 + X3 + X4 + X5 + X6, data = train_rf, ntree = 500)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 2
OOB estimate of error rate: 0.88%
Confusion matrix:
0 1 class.error
0 56 0 0.00000000
1 1 56 0.01754386
set.seed(12345)
importance(rf)
MeanDecreaseGini
X1 1.078579
X2 2.833452
X3 4.902606
X4 14.906531
X5 5.751345
X6 26.515655
varImpPlot(rf)
pred1 = predict(rf,test_rf, type='prob')
perf = prediction(pred1[,2], test_rf$genuine)
auc = performance(perf, 'auc')
auc_value = auc@y.values[[1]] # Extract the numeric AUC value
pred3 = performance(perf, 'tpr', 'fpr')
# Plot the ROC curve
plot(pred3, main="ROC Curve for Random Forest", col=2, lwd=2)
abline(a=0, b=1, lwd=2, lty=2, col='gray')
# Add AUC value to the plot
text(0.6, 0.2, paste("AUC =", round(auc_value, 3)), col=2, cex=1.2)