参考论文:
An, Jinwon, and Sungzoon Cho. “Variational autoencoder based anomaly detection using reconstruction probability.” Special Lecture on IE 2.1 (2015): 1-18.

整体的算法思路

AutoEncoder的模型与pytorch建模可以参考:

  1. 将正常样本与异常样本切分为:训练集X,训练集Y,测试集X,测试集Y
  2. AutoEncoder建模:搭建编码器(encoder)与解码器(decoder),输出维度与输入维度相同
  3. 用正常样本数据训练AutoEncoder:因为AutoEncoder的目标是复现(重构)输入数据,所以要确保训练时它看到的都只是正常数据;这样当异常数据到来时,重构误差就会明显偏大,这正是我们想要的效果。
  4. 计算阈值:这是很多研究讨论的重点。异常样本的重构误差会明显偏大,但误差要大到什么程度才判定为异常样本,正是这一步需要明确的
  5. 让样本通过Autoencoder模型,复现后的Loss超过阈值,就被认为是异常样本,没有超过阈值则为正常样本

示例代码

在下面的代码中,各步骤按从上到下的顺序排列,并通过main()函数串联整个流程,这样Debug效果更佳

import numpy as np
from pyod.utils.data import generate_data
import torch
import torch.nn as nn
import torch.utils.data as Data
from loguru import logger
from sklearn.metrics import accuracy_score, recall_score


# 步骤1:获取数据源
def get_data():
    """Generate the synthetic train/test dataset used by the demo.

    Calls ``generate_data`` with 10000 training samples, 1000 test samples,
    50 features and a contamination of 0.2.

    Returns:
        The four values produced by ``generate_data``, as the tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    dataset = generate_data(
        n_train=10000,
        n_test=1000,
        n_features=50,
        contamination=0.2,
        behaviour='new',
    )
    # Unpack explicitly so an unexpected number of return values fails here.
    X_train, X_test, y_train, y_test = dataset
    return X_train, X_test, y_train, y_test


# 步骤2:AutoEncoder建模,这里根据自己的数据调整模型,AutoEncoder不是一个固定的模型
class AutoEncoder(nn.Module):
    """Single-hidden-layer autoencoder built from two linear layers.

    The input is projected to ``hidden_layer_size`` units, passed through a
    sigmoid, and projected back to ``input_size`` units. No activation is
    applied to the reconstructed output.

    Args:
        input_size: Number of features per sample; also the output width.
        hidden_layer_size: Width of the hidden (bottleneck) layer.
    """

    def __init__(self, input_size, hidden_layer_size=32):
        super().__init__()
        # The network reconstructs its input, so both ends share one width.
        self.input_size = self.output_size = input_size
        self.hidden_layer_size = hidden_layer_size

        # Encoder is created before the decoder, matching the order in which
        # parameters are initialised and registered.
        self.encode_linear = nn.Linear(input_size, hidden_layer_size)
        self.decode_linear = nn.Linear(hidden_layer_size, input_size)
        # NOTE: ``relu`` is registered on the module but ``forward`` never
        # calls it; only the sigmoid is used as the hidden activation.
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_x):
        """Return the reconstruction of ``input_x``.

        Args:
            input_x: Tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor with the same leading dimensions as ``input_x`` and a last
            dimension of ``output_size``.
        """
        hidden = self.sigmoid(self.encode_linear(input_x))
        return self.decode_linear(hidden)


# 步骤3:训练模型
def train_auto_encoder(normal_data: np.ndarray, *, batch_size: int = 20,
                       epochs: int = 30, lr: float = 0.001):
    """Train an ``AutoEncoder`` to reconstruct the given samples.

    The caller is expected to pass only normal (non-anomalous) samples so the
    model learns to reconstruct normal data well.

    Args:
        normal_data: 2-D array of shape ``(n_samples, n_features)``.
        batch_size: Mini-batch size used by the data loader.
        epochs: Number of passes over the training data.
        lr: Learning rate for the Adam optimizer.

    Returns:
        A ``(model, best_loss)`` tuple: the trained model and the smallest
        per-epoch mean training loss (mean over mini-batches).

    Raises:
        ValueError: If ``normal_data`` is not 2-D or contains no samples.
    """
    train_tensor = torch.tensor(normal_data).float()
    if train_tensor.dim() != 2 or train_tensor.shape[0] == 0:
        raise ValueError(
            "normal_data must be a non-empty 2-D array of shape (n_samples, n_features)"
        )

    # The data already lives in memory as one tensor, so worker processes
    # would only add start-up overhead on every epoch; load in-process.
    train_loader = Data.DataLoader(
        dataset=Data.TensorDataset(train_tensor),
        batch_size=batch_size,
        shuffle=True,
    )

    model = AutoEncoder(train_tensor.shape[1])
    loss_function = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        batch_losses = []
        for (batch,) in train_loader:
            optimizer.zero_grad()
            # Do not squeeze the output: it must keep the exact shape of the
            # target. Squeezing turns (B, 1) into (B,), which MSELoss then
            # broadcasts against (B, 1) into a (B, B) matrix and silently
            # computes the wrong loss.
            reconstruction = model(batch)
            loss = loss_function(reconstruction, batch)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
        mean_loss = np.mean(batch_losses)
        logger.debug("Train Step:{} loss: {}", epoch, mean_loss)
        epoch_losses.append(mean_loss)
    return model, np.min(epoch_losses)


# 步骤4:模型训练完成,开始找正常数据与异常数据的阈值
def set_threshold(auto_encoder_model, error_data: np.ndarray, normal_loss):
    """Compute the loss threshold that separates normal from anomalous samples.

    Every anomalous sample is reconstructed by the model and its own mean
    squared error is measured. The threshold is the midpoint between
    ``normal_loss`` and the smallest of those per-sample errors.

    Args:
        auto_encoder_model: Trained model; it is switched to eval mode.
        error_data: Array of known anomalous samples, one sample per row.
        normal_loss: Reference reconstruction loss of the normal data.

    Returns:
        The threshold ``(min_anomaly_loss + normal_loss) / 2``.

    Raises:
        ValueError: If ``error_data`` contains no samples.
    """
    error_tensor = torch.tensor(error_data).float()
    if error_tensor.shape[0] == 0:
        raise ValueError("error_data must contain at least one sample")

    auto_encoder_model.eval()
    # Pure inference: no gradients needed. One forward pass over all rows
    # gives the same per-sample MSE as feeding the samples one at a time.
    with torch.no_grad():
        reconstruction = auto_encoder_model(error_tensor)
        squared_error = (reconstruction - error_tensor) ** 2
        per_sample_loss = squared_error.flatten(start_dim=1).mean(dim=1)
    min_error_loss = per_sample_loss.min().item()

    # Midpoint of the two losses. The previous (min - normal) / 2 was only
    # half the gap and could fall below the normal loss itself.
    threshold = (min_error_loss + normal_loss) / 2
    logger.info("阈值计算完毕:正常样本的最大loss:{},异常样本的最小loss:{},阈值:{}", normal_loss, min_error_loss, threshold)
    return threshold


# 步骤5:异常检测
def anomaly_detection(auto_encoder_model, threshold, x_test, y_test):
    """Classify test samples by reconstruction error and log accuracy/recall.

    A sample is predicted normal (0) when its mean squared reconstruction
    error is below ``threshold`` and anomalous (1) otherwise. Predictions are
    produced in the same row order as ``x_test`` so they line up with
    ``y_test`` when the metrics are computed.

    Args:
        auto_encoder_model: Trained model; it is switched to eval mode.
        threshold: Loss value at or above which a sample counts as anomalous.
        x_test: Array of test samples, one sample per row.
        y_test: Ground-truth labels for ``x_test`` (0 normal, 1 anomalous),
            in the same row order.
    """
    test_tensor = torch.tensor(x_test).float()

    auto_encoder_model.eval()
    # Score every row in its original order. The previous implementation
    # iterated a shuffled DataLoader, so predictions were randomly permuted
    # relative to y_test and the reported metrics were meaningless.
    with torch.no_grad():
        reconstruction = auto_encoder_model(test_tensor)
        squared_error = (reconstruction - test_tensor) ** 2
        per_sample_loss = squared_error.flatten(start_dim=1).mean(dim=1)

    # "Not below the threshold" means anomalous (same rule as before,
    # including for NaN losses, which compare as not-below).
    is_anomaly = ~(per_sample_loss < float(threshold))
    pred_list = is_anomaly.long().tolist()

    logger.info("准确率:{} 召回率:{}", accuracy_score(y_test, pred_list), recall_score(y_test, pred_list))


def main():
    """Run the whole demo: build data, train, pick a threshold, evaluate."""
    x_train, x_test, y_train, y_test = get_data()

    # Label 0 selects the rows used for training; label 1 selects the rows
    # used to calibrate the threshold.
    normal_rows = y_train == 0
    anomaly_rows = y_train == 1

    model, normal_loss = train_auto_encoder(x_train[normal_rows])
    threshold = set_threshold(model, x_train[anomaly_rows], normal_loss)
    anomaly_detection(model, threshold, x_test, y_test)


if __name__ == '__main__':
    main()

打印的日志:

2022-03-25 16:22:45.409 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:0 loss: 7.666098117828369
2022-03-25 16:22:47.507 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:1 loss: 0.2303541898727417
2022-03-25 16:22:49.653 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:2 loss: 0.22991487383842468
2022-03-25 16:22:51.769 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:3 loss: 0.2299502193927765
2022-03-25 16:22:53.881 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:4 loss: 0.23001298308372498
2022-03-25 16:22:56.002 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:5 loss: 0.23006229102611542
2022-03-25 16:22:58.057 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:6 loss: 0.23018917441368103
2022-03-25 16:23:00.111 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:7 loss: 0.23030006885528564
2022-03-25 16:23:02.169 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:8 loss: 0.23039482533931732
2022-03-25 16:23:04.224 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:9 loss: 0.23047684133052826
2022-03-25 16:23:06.284 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:10 loss: 0.23054634034633636
2022-03-25 16:23:08.323 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:11 loss: 0.23061569035053253
2022-03-25 16:23:10.347 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:12 loss: 0.23073334991931915
2022-03-25 16:23:12.372 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:13 loss: 0.22996807098388672
2022-03-25 16:23:14.427 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:14 loss: 0.2291102260351181
2022-03-25 16:23:16.485 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:15 loss: 0.2279823124408722
2022-03-25 16:23:18.540 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:16 loss: 0.22661711275577545
2022-03-25 16:23:20.598 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:17 loss: 0.22481602430343628
2022-03-25 16:23:22.646 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:18 loss: 0.221832737326622
2022-03-25 16:23:24.710 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:19 loss: 0.2177421599626541
2022-03-25 16:23:26.776 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:20 loss: 0.21366573870182037
2022-03-25 16:23:28.779 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:21 loss: 0.2092127650976181
2022-03-25 16:23:30.764 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:22 loss: 0.2051170915365219
2022-03-25 16:23:32.736 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:23 loss: 0.19992759823799133
2022-03-25 16:23:34.755 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:24 loss: 0.19542557001113892
2022-03-25 16:23:36.771 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:25 loss: 0.1883782595396042
2022-03-25 16:23:38.757 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:26 loss: 0.1813327670097351
2022-03-25 16:23:40.760 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:27 loss: 0.17500406503677368
2022-03-25 16:23:42.744 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:28 loss: 0.16993017494678497
2022-03-25 16:23:44.727 | DEBUG    | __main__:train_auto_encoder:93 - Train Step:29 loss: 0.16403226554393768
2022-03-25 16:23:47.161 | INFO     | __main__:set_threshold:116 - 阈值计算完毕:正常样本的最大loss:0.16403226554393768,异常样本的最小loss:15.091814041137695,阈值:7.46389102935791
2022-03-25 16:23:49.223 | INFO     | __main__:anomaly_detection:142 - 准确率:0.694 召回率:0.235