Introduction
The combination of Isolation Forest and Long Short-Term Memory (LSTM) networks represents a powerful approach to anomaly detection, leveraging the strengths of both unsupervised and supervised learning methods. This implementation demonstrates how these algorithms can work together to create a robust anomaly detection system.
Isolation Forest + LSTM Ensemble for Anomaly Detection
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from typing import Tuple, List, Optional
import time
class AnomalyDetectionEnsemble:
    """Anomaly detector that averages Isolation Forest and LSTM scores.

    The Isolation Forest scores every point independently, while the LSTM
    is trained to forecast the next value from the previous
    ``sequence_length`` values and contributes its absolute forecast error.
    Both score streams are min-max normalized to [0, 1] and averaged with
    equal weights; a point is flagged when the combined score exceeds
    ``ensemble_threshold``.

    The first ``sequence_length`` points have no LSTM history. Their LSTM
    score is reported as 0 and their combined score falls back to the
    Isolation Forest score alone, so every returned array has one entry
    per input point.
    """

    def __init__(self,
                 sequence_length: int = 10,
                 isolation_forest_contamination: float = 0.1,
                 lstm_units: int = 50,
                 ensemble_threshold: float = 0.5):
        """
        Initialize the ensemble anomaly detector

        Args:
            sequence_length: Length of sequences for LSTM
            isolation_forest_contamination: Expected proportion of anomalies.
                It is passed to the forest, but the ensemble only consumes
                ``score_samples``, not the forest's own hard labels.
            lstm_units: Number of LSTM units
            ensemble_threshold: Threshold for combining predictions

        Raises:
            ValueError: If ``sequence_length`` is smaller than 1.
        """
        if sequence_length < 1:
            raise ValueError("sequence_length must be a positive integer")
        self.sequence_length = sequence_length
        self.isolation_forest = IsolationForest(
            contamination=isolation_forest_contamination,
            random_state=42,
        )
        self.lstm_model = self._build_lstm_model(lstm_units)
        self.scaler = StandardScaler()
        self.ensemble_threshold = ensemble_threshold

    def _build_lstm_model(self, lstm_units: int) -> "Sequential":
        """Build the two-layer LSTM used for one-step-ahead prediction."""
        model = Sequential([
            LSTM(lstm_units, input_shape=(self.sequence_length, 1), return_sequences=True),
            Dropout(0.2),
            # Keep at least one unit so very small lstm_units values still build.
            LSTM(max(1, lstm_units // 2)),
            Dropout(0.2),
            Dense(1),
        ])
        model.compile(optimizer='adam', loss='mse')
        return model

    def _prepare_sequences(self, data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Slice ``data`` into sliding windows and their next-step targets.

        Args:
            data: Array whose first axis is time.

        Returns:
            Tuple ``(X, y)`` where ``X[i]`` is ``data[i:i + sequence_length]``
            and ``y[i]`` is ``data[i + sequence_length]``. Both are empty
            along the first axis when ``data`` is not longer than one window.
        """
        data = np.asarray(data)
        window_count = max(len(data) - self.sequence_length, 0)
        # Row i of the index matrix holds i, i + 1, ..., i + sequence_length - 1.
        window_index = np.arange(window_count)[:, None] + np.arange(self.sequence_length)
        return data[window_index], data[self.sequence_length:]

    @staticmethod
    def _min_max_normalize(scores: np.ndarray) -> np.ndarray:
        """Rescale ``scores`` to [0, 1]; constant or empty input maps to zeros."""
        scores = np.asarray(scores, dtype=float)
        if scores.size == 0:
            return scores
        lowest = scores.min()
        spread = scores.max() - lowest
        if spread == 0:
            # All scores identical: avoid 0/0 and report "nothing stands out".
            return np.zeros_like(scores)
        return (scores - lowest) / spread

    @staticmethod
    def _pad_warmup(scores: np.ndarray, n_points: int) -> np.ndarray:
        """Left-pad ``scores`` with zeros so the result has ``n_points`` entries."""
        scores = np.asarray(scores, dtype=float).reshape(-1)
        padded = np.zeros(n_points, dtype=float)
        if scores.size:
            padded[n_points - scores.size:] = scores
        return padded

    @staticmethod
    def _combine_scores(if_scores: np.ndarray,
                        lstm_scores: np.ndarray,
                        warmup: int) -> np.ndarray:
        """Average the two normalized score streams into one anomaly score.

        Args:
            if_scores: Normalized Isolation Forest scores (higher = more normal).
            lstm_scores: Normalized LSTM errors (higher = more anomalous),
                same length as ``if_scores``.
            warmup: Number of leading points that have no LSTM score.

        Returns:
            Combined anomaly scores in [0, 1], higher = more anomalous.
        """
        forest_anomaly = 1.0 - if_scores
        combined = 0.5 * forest_anomaly + 0.5 * lstm_scores
        # Without LSTM context a 50/50 average could never exceed 0.5, so the
        # warm-up points rely on the forest alone.
        combined[:warmup] = forest_anomaly[:warmup]
        return combined

    def fit(self, data: np.ndarray, epochs: int = 50, verbose: int = 1) -> None:
        """
        Fit both models on the training data

        Args:
            data: 1D array of time series data
            epochs: Number of epochs for LSTM training
            verbose: Verbosity level for training

        Raises:
            ValueError: If ``data`` has too few points to form one window
                plus its target.
        """
        values = np.asarray(data, dtype=float).reshape(-1, 1)
        if len(values) <= self.sequence_length:
            raise ValueError(
                "data must contain more than sequence_length "
                f"({self.sequence_length}) points, got {len(values)}"
            )
        # Scale data
        scaled_data = self.scaler.fit_transform(values)
        # Fit Isolation Forest
        self.isolation_forest.fit(scaled_data)
        # Prepare sequences and fit LSTM
        windows, targets = self._prepare_sequences(scaled_data)
        self.lstm_model.fit(
            windows, targets,
            epochs=epochs,
            verbose=verbose,
            validation_split=0.2,
        )

    def predict(self, data: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Predict anomalies using both models

        Args:
            data: 1D array of time series data

        Returns:
            Tuple of (ensemble predictions, isolation forest scores, lstm scores).
            Every array has one entry per input point. Predictions are 1 for
            anomalies and 0 otherwise. Isolation Forest scores are normalized
            so that higher means more normal; LSTM scores are normalized
            absolute forecast errors (higher means more anomalous) and are 0
            for the first ``sequence_length`` points.
        """
        # Scale data
        values = np.asarray(data, dtype=float).reshape(-1, 1)
        scaled_data = self.scaler.transform(values)
        n_points = len(scaled_data)

        # Isolation Forest scores: one per point.
        if_scores = self._min_max_normalize(
            self.isolation_forest.score_samples(scaled_data)
        )

        # LSTM forecast errors: one per point that has a full window before it.
        windows, targets = self._prepare_sequences(scaled_data)
        if len(windows):
            forecasts = np.asarray(self.lstm_model.predict(windows, verbose=0))
            errors = np.abs(targets.reshape(-1) - forecasts.reshape(-1))
        else:
            errors = np.empty(0, dtype=float)
        warmup = n_points - errors.size
        lstm_scores = self._pad_warmup(self._min_max_normalize(errors), n_points)

        # Combine predictions
        ensemble_scores = self._combine_scores(if_scores, lstm_scores, warmup)
        ensemble_predictions = (ensemble_scores > self.ensemble_threshold).astype(int)
        return ensemble_predictions, if_scores, lstm_scores
def generate_synthetic_data(n_points: int = 1000,
                            anomaly_fraction: float = 0.05,
                            seed: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
    """Generate synthetic time series data with anomalies

    The series is a noisy sine wave over ``[0, 10]``. A random subset of
    points is perturbed with extra zero-mean Gaussian noise (std 0.5) and
    labelled as anomalous.

    Args:
        n_points: Number of samples in the series.
        anomaly_fraction: Fraction of points to perturb, in ``[0, 1]``.
        seed: Seed for a private random generator. When ``None`` the global
            NumPy random state is used.

    Returns:
        Tuple ``(data, labels)`` of 1D arrays of length ``n_points``;
        ``labels`` is 1.0 at perturbed indices and 0.0 elsewhere.

    Raises:
        ValueError: If ``anomaly_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= anomaly_fraction <= 1.0:
        raise ValueError("anomaly_fraction must be between 0 and 1")
    # np.random and RandomState expose the same normal/choice API, so the
    # unseeded path keeps drawing from the global state.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Generate normal data
    t = np.linspace(0, 10, n_points)
    series = np.sin(t) + rng.normal(0, 0.1, n_points)

    # Add anomalies
    labels = np.zeros(n_points)
    anomaly_count = int(anomaly_fraction * n_points)
    anomaly_indices = rng.choice(n_points, size=anomaly_count, replace=False)
    series[anomaly_indices] += rng.normal(0, 0.5, anomaly_count)
    labels[anomaly_indices] = 1
    return series, labels
def plot_results(data: np.ndarray,
                 true_anomalies: np.ndarray,
                 predicted_anomalies: np.ndarray,
                 if_scores: np.ndarray,
                 lstm_scores: np.ndarray) -> None:
    """Plot the results of anomaly detection

    Draws three stacked panels and shows the figure: the series with true
    and predicted anomalies marked, the Isolation Forest scores, and the
    LSTM scores.

    Args:
        data: 1D time series.
        true_anomalies: Array with 1 at the true anomaly positions.
        predicted_anomalies: Array with 1 at the predicted anomaly positions.
        if_scores: Isolation Forest scores to plot.
        lstm_scores: LSTM scores to plot.
    """
    figure, (series_ax, forest_ax, lstm_ax) = plt.subplots(3, 1, figsize=(15, 10))

    # Plot original data and anomalies
    true_idx = np.flatnonzero(true_anomalies == 1)
    predicted_idx = np.flatnonzero(predicted_anomalies == 1)
    series_ax.plot(data, label='Original Data')
    series_ax.scatter(true_idx, data[true_idx],
                      color='red', label='True Anomalies')
    series_ax.scatter(predicted_idx, data[predicted_idx],
                      color='green', marker='x', label='Predicted Anomalies')
    series_ax.legend()
    series_ax.set_title('Time Series with Anomalies')

    # Plot Isolation Forest scores
    forest_ax.plot(if_scores, label='Isolation Forest Scores')
    forest_ax.legend()
    forest_ax.set_title('Isolation Forest Anomaly Scores')

    # Plot LSTM scores
    lstm_ax.plot(lstm_scores, label='LSTM Reconstruction Error')
    lstm_ax.legend()
    lstm_ax.set_title('LSTM Anomaly Scores')

    figure.tight_layout()
    plt.show()
def demonstrate_ensemble():
    """Demonstrate the ensemble anomaly detector

    Generates a synthetic series, fits the detector on it, predicts on the
    same series, prints precision, recall and F1 against the synthetic
    labels, and plots the results.
    """
    # Generate synthetic data
    print("Generating synthetic data…")
    data, true_anomalies = generate_synthetic_data()

    # Create and train ensemble
    print("\nTraining ensemble detector…")
    detector = AnomalyDetectionEnsemble()
    detector.fit(data, epochs=20, verbose=0)

    # Make predictions
    print("Making predictions…")
    predictions, if_scores, lstm_scores = detector.predict(data)

    # Calculate metrics
    true_positives = int(np.sum((predictions == 1) & (true_anomalies == 1)))
    false_positives = int(np.sum((predictions == 1) & (true_anomalies == 0)))
    false_negatives = int(np.sum((predictions == 0) & (true_anomalies == 1)))

    # A detector that flags nothing (or data without anomalies) would give a
    # zero denominator; report 0.0 instead of NaN in those cases.
    flagged = true_positives + false_positives
    actual = true_positives + false_negatives
    precision = true_positives / flagged if flagged else 0.0
    recall = true_positives / actual if actual else 0.0
    if precision + recall:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    print("\nResults:")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1 Score: {f1_score:.3f}")

    # Plot results
    plot_results(data, true_anomalies, predictions, if_scores, lstm_scores)
# Run the demonstration only when the file is executed as a script.
if __name__ == "__main__":
    demonstrate_ensemble()
Algorithm Design
The ensemble combines the strengths of two distinct approaches to anomaly detection. The Isolation Forest algorithm excels at identifying global outliers through recursive partitioning, while the LSTM network captures complex temporal patterns and dependencies in the data sequence. This combination enables the detection of both point anomalies and contextual anomalies within time series data.
Implementation Details
The implementation features several key components:
1. Data Preprocessing: The system implements standardization and sequence preparation for LSTM training, ensuring optimal performance for both algorithms.
2. Isolation Forest: The unsupervised component identifies global outliers based on the isolation principle, where anomalies are typically easier to isolate than normal points.
3. LSTM Architecture: The supervised component uses a deep LSTM network with dropout layers to prevent overfitting and capture temporal dependencies in the data.
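4. Ensemble Integration: The system min-max normalizes each model's scores and combines them with a weighted average, flagging a point as an anomaly when the combined score exceeds the ensemble threshold. The weights are currently fixed at 0.5 each in the code and can be changed there to adjust each model's influence on the final prediction.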
Performance Optimization
The implementation includes several optimizations to enhance performance:
• Efficient sequence preparation for LSTM training
• Vectorized operations for score calculation
• Balanced dropout layers to prevent overfitting
• Normalized scoring system for consistent ensemble integration
Visualization and Analysis
The system provides comprehensive visualization capabilities:
• Time series plots with highlighted anomalies
• Individual model score distributions
• Ensemble prediction results
Future Developments
Ongoing research continues to enhance the ensemble’s capabilities through:
• Advanced architecture optimization
• Adaptive weighting mechanisms
• Improved score normalization techniques
• Enhanced visualization capabilities
Conclusion
The Isolation Forest-LSTM ensemble represents a powerful approach to anomaly detection, combining the strengths of both supervised and unsupervised learning. Organizations can leverage this implementation to develop robust anomaly detection systems tailored to their specific needs.
Technical Support
For detailed implementation guidance and technical documentation, contact our AI systems team at business@decentcybersecurity.eu. Our experts can assist in developing customized anomaly detection solutions that meet your specific requirements while ensuring optimal performance.
Decent Cybersecurity provides advanced AI solutions for anomaly detection worldwide. Our systems ensure accurate detection while maintaining computational efficiency.
When you run this code, it will:
1. Generate synthetic time series data with embedded anomalies
2. Train the ensemble detector
3. Make predictions on the same synthetic series used for training (the demo does not hold out a separate test set)
4. Display performance metrics
5. Generate visualizations showing the original data, true anomalies, predicted anomalies, and individual model scores
The output will include precision, recall, and F1 score metrics, along with three plots showing:
1. The original time series with marked true and predicted anomalies
2. Isolation Forest anomaly scores
3. LSTM prediction-error scores (labelled "LSTM Reconstruction Error" in the plot legend)
Execution Result:
Generating synthetic data…
Training ensemble detector…