参考论文:
An, Jinwon, and Sungzoon Cho. “Variational autoencoder based anomaly detection using reconstruction probability.” Special Lecture on IE 2.1 (2015): 1-18.
整体的算法思路
AutoEncoder的模型与pytorch建模可以参考:
- 将正常样本与异常样本切分为:训练集X,训练集Y,测试集X,测试集Y
- AutoEncoder建模:根据数据的特点搭建AutoEncoder网络结构(编码器 + 解码器)
- 用正常样本数据训练AutoEncoder:因为AutoEncoder的目标是复现输入数据,所以要确保训练时它只见过正常数据;这样当异常数据到来时,重构结果会明显偏离输入(重构误差很大),这正是我们想要的效果。
- 计算阈值:这是很多研究讨论的点。异常样本的重构误差会明显偏大,但误差大到什么程度才判定为异常样本,正是这一步需要明确的。
- 让样本通过Autoencoder模型,复现后的Loss超过阈值,就被认为是异常样本,没有超过阈值则为正常样本
示例代码
在下面的代码中,各步骤从上到下依次排列,并通过 def main() 函数串联整个流程,Debug 效果更佳。
import numpy as np
from pyod.utils.data import generate_data
import torch
import torch.nn as nn
import torch.utils.data as Data
from loguru import logger
from sklearn.metrics import accuracy_score, recall_score
# Step 1: obtain the data source
def get_data(n_train=10000, n_test=1000, n_features=50, contamination=0.2,
             random_state=None):
    """Generate a synthetic outlier-detection dataset with pyod.

    The defaults reproduce the original hard-coded configuration, so
    calling ``get_data()`` with no arguments behaves as before.

    Args:
        n_train: Number of training samples to generate.
        n_test: Number of test samples to generate.
        n_features: Number of features per sample.
        contamination: Proportion of outliers, forwarded to pyod.
        random_state: Optional seed forwarded to pyod; ``None`` keeps the
            original non-deterministic behaviour.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)`` produced by
        ``pyod.utils.data.generate_data``. The caller in this file treats
        label 0 as normal and label 1 as anomalous.
    """
    X_train, X_test, y_train, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        n_features=n_features,
        contamination=contamination,
        behaviour='new',
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test
# Step 2: build the AutoEncoder. Adapt the architecture to your own data;
# an autoencoder is not one fixed model.
class AutoEncoder(nn.Module):
    """Single-hidden-layer autoencoder.

    The encoder is ``Linear(input_size -> hidden_layer_size)`` followed by
    a sigmoid; the decoder is ``Linear(hidden_layer_size -> input_size)``
    with no output activation, so reconstructions are unbounded.

    Args:
        input_size: Number of features per sample (also the output width).
        hidden_layer_size: Width of the bottleneck layer.
    """

    def __init__(self, input_size, hidden_layer_size=32):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        # Input and output have the same dimensionality.
        self.input_size = input_size
        self.output_size = input_size
        self.encode_linear = nn.Linear(self.input_size, hidden_layer_size)
        self.decode_linear = nn.Linear(hidden_layer_size, self.output_size)
        # NOTE: ``relu`` is not used by ``forward``; the attribute is kept so
        # that code accessing ``model.relu`` keeps working.
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_x):
        """Return the reconstruction of ``input_x`` (same trailing size)."""
        # Encode: project into the bottleneck and squash with a sigmoid.
        hidden = self.sigmoid(self.encode_linear(input_x))
        # Decode: linear projection back to the input dimensionality.
        return self.decode_linear(hidden)
# Step 3: train the model
def train_auto_encoder(normal_data: np.ndarray, batch_size=20, epochs=30,
                       lr=0.001, num_workers=2):
    """Train an AutoEncoder on normal (non-anomalous) samples only.

    Args:
        normal_data: 2-D array of shape ``(n_samples, n_features)``.
        batch_size: Mini-batch size used by the DataLoader.
        epochs: Number of passes over the data; must be at least 1.
        lr: Learning rate for the Adam optimizer.
        num_workers: Number of DataLoader worker processes.

    Returns:
        ``(model, best_loss)`` where ``model`` is the trained AutoEncoder
        and ``best_loss`` is the smallest per-epoch mean batch MSE seen
        during training.

    Raises:
        ValueError: If ``normal_data`` is not a non-empty 2-D array or
            ``epochs`` is smaller than 1.
    """
    train_tensor = torch.tensor(normal_data).float()
    if train_tensor.dim() != 2 or train_tensor.shape[0] == 0:
        raise ValueError("normal_data must be a non-empty 2-D array")
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    train_loader = Data.DataLoader(
        dataset=Data.TensorDataset(train_tensor),
        batch_size=batch_size,
        shuffle=True,  # shuffling is fine here: the target is the input itself
        num_workers=num_workers,
    )

    # The usual trio: model, loss, optimizer.
    model = AutoEncoder(train_tensor.shape[1])
    loss_function = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    model.train()
    loss_list = []
    for epoch in range(epochs):
        batch_losses = []
        for (batch,) in train_loader:
            optimizer.zero_grad()
            # No squeeze(): the output already has the input's shape, and
            # squeezing a single-sample batch (or a single-feature input)
            # would make MSELoss broadcast mismatched shapes.
            reconstruction = model(batch)
            batch_loss = loss_function(reconstruction, batch)
            batch_loss.backward()
            optimizer.step()
            batch_losses.append(batch_loss.item())
        epoch_loss = np.mean(batch_losses)
        logger.debug("Train Step:{} loss: {}", epoch, epoch_loss)
        loss_list.append(epoch_loss)
    return model, np.min(loss_list)
# Step 4: training is done; find the loss threshold that separates normal
# data from anomalous data.
def set_threshold(auto_encoder_model, error_data: np.ndarray, normal_loss):
    """Compute the anomaly threshold as a midpoint between two losses.

    The threshold is the midpoint between ``normal_loss`` and the smallest
    per-sample reconstruction MSE over ``error_data``.

    Args:
        auto_encoder_model: Trained autoencoder; it is switched to eval mode.
        error_data: 2-D array ``(n_samples, n_features)`` of known anomalies.
        normal_loss: Reference loss for normal data (the caller in this file
            passes the best per-epoch training loss).

    Returns:
        The threshold; a loss at or above it is treated as anomalous.

    Raises:
        ValueError: If ``error_data`` is not a non-empty 2-D array.
    """
    error_tensor = torch.tensor(error_data).float()
    if error_tensor.dim() != 2 or error_tensor.shape[0] == 0:
        raise ValueError("error_data must be a non-empty 2-D array")

    auto_encoder_model.eval()
    # One forward pass without gradient tracking. The per-row mean squared
    # error equals what nn.MSELoss yields for a batch of one sample.
    with torch.no_grad():
        reconstruction = auto_encoder_model(error_tensor)
        per_sample_loss = ((reconstruction - error_tensor) ** 2).mean(dim=1)
    min_error_loss = per_sample_loss.min().item()

    # Midpoint of the two losses. The previous expression,
    # (min_error_loss - normal_loss) / 2, was only half the gap and sat
    # below the intended midpoint.
    threshold = (min_error_loss + normal_loss) / 2
    # NOTE: the first label in the message says "max loss of normal samples",
    # but the value logged is ``normal_loss`` exactly as passed by the caller.
    logger.info("阈值计算完毕:正常样本的最大loss:{},异常样本的最小loss:{},阈值:{}", normal_loss, min_error_loss, threshold)
    return threshold
# Step 5: anomaly detection
def anomaly_detection(auto_encoder_model, threshold, x_test, y_test):
    """Classify test samples by reconstruction loss and log the metrics.

    A sample is predicted normal (0) when its reconstruction MSE is strictly
    below ``threshold`` and anomalous (1) otherwise.

    Args:
        auto_encoder_model: Trained autoencoder; it is switched to eval mode.
        threshold: Loss threshold separating normal from anomalous samples.
        x_test: 2-D array ``(n_samples, n_features)`` of test samples.
        y_test: Ground-truth labels aligned with the rows of ``x_test``.

    Returns:
        The list of predicted labels, in the same order as ``x_test``.
        (The original returned ``None``; callers that ignore the return
        value are unaffected.)
    """
    test_tensor = torch.tensor(x_test).float()

    auto_encoder_model.eval()
    # Score the rows in their original order. The previous version used a
    # shuffled DataLoader, so predictions came out in random order while
    # y_test did not, which invalidated the reported metrics.
    with torch.no_grad():
        reconstruction = auto_encoder_model(test_tensor)
        per_sample_loss = ((reconstruction - test_tensor) ** 2).mean(dim=1)

    # "not (loss < threshold)" keeps the original rule, including that a NaN
    # loss is flagged as an anomaly.
    is_anomaly = ~(per_sample_loss < float(threshold))
    pred_list = is_anomaly.int().tolist()

    # Evaluation metrics
    logger.info("准确率:{} 召回率:{}", accuracy_score(y_test, pred_list), recall_score(y_test, pred_list))
    return pred_list
def main():
    """Run the whole pipeline: data, training, threshold, detection."""
    x_train, x_test, y_train, y_test = get_data()

    # Train on the normal samples only (label 0).
    normal_samples = x_train[y_train == 0]
    model, normal_loss = train_auto_encoder(normal_samples)

    # Derive the threshold from the known anomalies (label 1).
    anomalous_samples = x_train[y_train == 1]
    threshold = set_threshold(model, anomalous_samples, normal_loss)

    anomaly_detection(model, threshold, x_test, y_test)


if __name__ == '__main__':
    main()
打印的日志:
2022-03-25 16:22:45.409 | DEBUG | __main__:train_auto_encoder:93 - Train Step:0 loss: 7.666098117828369
2022-03-25 16:22:47.507 | DEBUG | __main__:train_auto_encoder:93 - Train Step:1 loss: 0.2303541898727417
2022-03-25 16:22:49.653 | DEBUG | __main__:train_auto_encoder:93 - Train Step:2 loss: 0.22991487383842468
2022-03-25 16:22:51.769 | DEBUG | __main__:train_auto_encoder:93 - Train Step:3 loss: 0.2299502193927765
2022-03-25 16:22:53.881 | DEBUG | __main__:train_auto_encoder:93 - Train Step:4 loss: 0.23001298308372498
2022-03-25 16:22:56.002 | DEBUG | __main__:train_auto_encoder:93 - Train Step:5 loss: 0.23006229102611542
2022-03-25 16:22:58.057 | DEBUG | __main__:train_auto_encoder:93 - Train Step:6 loss: 0.23018917441368103
2022-03-25 16:23:00.111 | DEBUG | __main__:train_auto_encoder:93 - Train Step:7 loss: 0.23030006885528564
2022-03-25 16:23:02.169 | DEBUG | __main__:train_auto_encoder:93 - Train Step:8 loss: 0.23039482533931732
2022-03-25 16:23:04.224 | DEBUG | __main__:train_auto_encoder:93 - Train Step:9 loss: 0.23047684133052826
2022-03-25 16:23:06.284 | DEBUG | __main__:train_auto_encoder:93 - Train Step:10 loss: 0.23054634034633636
2022-03-25 16:23:08.323 | DEBUG | __main__:train_auto_encoder:93 - Train Step:11 loss: 0.23061569035053253
2022-03-25 16:23:10.347 | DEBUG | __main__:train_auto_encoder:93 - Train Step:12 loss: 0.23073334991931915
2022-03-25 16:23:12.372 | DEBUG | __main__:train_auto_encoder:93 - Train Step:13 loss: 0.22996807098388672
2022-03-25 16:23:14.427 | DEBUG | __main__:train_auto_encoder:93 - Train Step:14 loss: 0.2291102260351181
2022-03-25 16:23:16.485 | DEBUG | __main__:train_auto_encoder:93 - Train Step:15 loss: 0.2279823124408722
2022-03-25 16:23:18.540 | DEBUG | __main__:train_auto_encoder:93 - Train Step:16 loss: 0.22661711275577545
2022-03-25 16:23:20.598 | DEBUG | __main__:train_auto_encoder:93 - Train Step:17 loss: 0.22481602430343628
2022-03-25 16:23:22.646 | DEBUG | __main__:train_auto_encoder:93 - Train Step:18 loss: 0.221832737326622
2022-03-25 16:23:24.710 | DEBUG | __main__:train_auto_encoder:93 - Train Step:19 loss: 0.2177421599626541
2022-03-25 16:23:26.776 | DEBUG | __main__:train_auto_encoder:93 - Train Step:20 loss: 0.21366573870182037
2022-03-25 16:23:28.779 | DEBUG | __main__:train_auto_encoder:93 - Train Step:21 loss: 0.2092127650976181
2022-03-25 16:23:30.764 | DEBUG | __main__:train_auto_encoder:93 - Train Step:22 loss: 0.2051170915365219
2022-03-25 16:23:32.736 | DEBUG | __main__:train_auto_encoder:93 - Train Step:23 loss: 0.19992759823799133
2022-03-25 16:23:34.755 | DEBUG | __main__:train_auto_encoder:93 - Train Step:24 loss: 0.19542557001113892
2022-03-25 16:23:36.771 | DEBUG | __main__:train_auto_encoder:93 - Train Step:25 loss: 0.1883782595396042
2022-03-25 16:23:38.757 | DEBUG | __main__:train_auto_encoder:93 - Train Step:26 loss: 0.1813327670097351
2022-03-25 16:23:40.760 | DEBUG | __main__:train_auto_encoder:93 - Train Step:27 loss: 0.17500406503677368
2022-03-25 16:23:42.744 | DEBUG | __main__:train_auto_encoder:93 - Train Step:28 loss: 0.16993017494678497
2022-03-25 16:23:44.727 | DEBUG | __main__:train_auto_encoder:93 - Train Step:29 loss: 0.16403226554393768
2022-03-25 16:23:47.161 | INFO | __main__:set_threshold:116 - 阈值计算完毕:正常样本的最大loss:0.16403226554393768,异常样本的最小loss:15.091814041137695,阈值:7.46389102935791
2022-03-25 16:23:49.223 | INFO | __main__:anomaly_detection:142 - 准确率:0.694 召回率:0.235