Skip to content

Commit 9957803

Browse files
committed
Add Great Expectations framework for data quality validation of cybersecurity datasets
- Created requirements file for Day 12 including Great Expectations, Pandas, and other dependencies. - Generated HTML report for security events data quality validation with detailed expectations and results. - Added multiple JSON files for validation results capturing various timestamps and detailed expectation outcomes.
1 parent 9d742d7 commit 9957803

17 files changed

+4756
-1
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ Each one ships with full code and documentation.
3434
| 9 | Modeling | Property Operations Data Warehouse (dbt) | Hospitality/Property Management | ✅ Complete | [Day 09](./day09) |
3535
| 10 | Modeling | Family Office Asset Management Data Warehouse | Wealth Management/Finance | ✅ Complete | [Day 10](./day10) |
3636
| 11 | Orchestration | Retail Daily Performance Report Automation | TBD | ✅ Complete | [Day 11](./day11) |
37-
| 12 | Orchestration | TBD | TBD | 🚧 Planned | [Day 12](./day12) |
37+
| 12 | Orchestration | Cybersecurity Data Quality Framework | TBD | ✅ Complete | [Day 12](./day12) |
3838
| 13 | Orchestration | TBD | TBD | 🚧 Planned | [Day 13](./day13) |
3939
| 14 | Orchestration | TBD | TBD | 🚧 Planned | [Day 14](./day14) |
4040
| 15 | Orchestration | TBD | TBD | 🚧 Planned | [Day 15](./day15) |

day12/.env.example

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# ==============================================================================
2+
# DAY 12 - GREAT EXPECTATIONS DATA QUALITY FRAMEWORK CONFIGURATION
3+
# ==============================================================================
4+
# Copy this file to ../config/.env (relative to the day12/ directory) and fill in your values
5+
# This demonstrates a simpler validation framework using GE concepts
6+
7+
# Environment
8+
DAY12_ENVIRONMENT=development # Options: development, production
9+
10+
# Logging
11+
DAY12_LOG_LEVEL=INFO # Options: DEBUG, INFO, WARNING, ERROR
12+
13+
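# Data Quality Thresholds (Production uses stricter thresholds)
# NOTE: day12_CONFIG_settings.py defines these thresholds itself and does not
# read the DAY12_THRESHOLD_* variables below from the environment; the values
# here document the development defaults.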
14+
# Development thresholds:
15+
DAY12_THRESHOLD_NULL_EVENT_IDS=0.02 # Max 2% null event IDs
16+
DAY12_THRESHOLD_PII_LEAKAGE=0.01 # Max 1% PII in usernames
17+
DAY12_THRESHOLD_FUTURE_TIMESTAMPS=0.01 # Max 1% future timestamps
18+
19+
# Notification Settings (Optional)
20+
DAY12_NOTIFY_ON_FAILURE=true
21+
DAY12_SLACK_WEBHOOK_URL=https://hooks.slack.com/services/YOUR/WEBHOOK/URL
22+
DAY12_ALERT_EMAIL=security-alerts@yourcompany.com
23+
24+
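# Great Expectations Project Paths (Auto-configured, override if needed)
# NOTE: day12_CONFIG_settings.py derives its paths from its own location and
# does not read the two variables below; confirm they are consumed elsewhere
# before relying on them.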
25+
# DAY12_GE_PROJECT_DIR=/path/to/custom/ge/project
26+
# DAY12_DATA_SOURCE_PATH=/path/to/custom/data
27+
28+
# ==============================================================================
29+
# USAGE INSTRUCTIONS
30+
# ==============================================================================
31+
# 1. Copy this file: cp .env.example ../config/.env
32+
# 2. Edit ../config/.env with your actual values
33+
# 3. Run validation: python3 day12_VALIDATOR_cybersecurity.py
34+
# 4. Check logs in: logs/day12_validation.log
35+
# 5. View results in: logs/validation_results/
36+
37+
# ==============================================================================
38+
# INTEGRATION WITH OTHER PROJECTS
39+
# ==============================================================================
40+
# This validation framework can be imported and used in other Day projects:
41+
#
42+
# from day12.day12_VALIDATOR_cybersecurity import Day12DataQualityValidator
43+
#
44+
# validator = Day12DataQualityValidator(your_dataframe, "your_dataset")
45+
# validator.expect_column_values_to_not_be_null('critical_field')
46+
# results = validator.validation_results

day12/README.md

Lines changed: 875 additions & 0 deletions
Large diffs are not rendered by default.

day12/data/day12_compliance_audit.csv

Lines changed: 501 additions & 0 deletions
Large diffs are not rendered by default.

day12/data/day12_security_events.csv

Lines changed: 1001 additions & 0 deletions
Large diffs are not rendered by default.

day12/day12_CONFIG_settings.py

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Day 12 - Configuration Settings for Great Expectations Data Quality Framework
4+
Security-focused data validation configuration
5+
"""
6+
7+
import os
8+
from pathlib import Path
9+
from dotenv import load_dotenv
10+
11+
# Load environment variables.
# .env.example instructs users to copy it to ../config/.env, i.e. a "config"
# directory that is a sibling of this project directory. The default
# load_dotenv() lookup only checks this file's directory and its ancestors, so
# it would never see that file; load it explicitly first. A missing file is
# not an error: load_dotenv simply loads nothing.
load_dotenv(Path(__file__).resolve().parent.parent / "config" / ".env")
# Keep the original default lookup as a fallback. load_dotenv does not override
# variables that are already set, so values from config/.env take precedence.
load_dotenv()
13+
14+
# ============================================================================
15+
# DAY 12 CONFIGURATION - GREAT EXPECTATIONS DATA QUALITY
16+
# ============================================================================
17+
18+
# Project paths
# Every location below is derived from the directory containing this file
# (Path(__file__).parent), not from the current working directory.
19+
DAY12_PROJECT_ROOT = Path(__file__).parent
20+
DAY12_DATA_DIR = DAY12_PROJECT_ROOT / "data"
21+
DAY12_GE_DIR = DAY12_PROJECT_ROOT / "great_expectations"
22+
DAY12_LOGS_DIR = DAY12_PROJECT_ROOT / "logs"
23+
24+
# Data source paths
# CSV inputs expected inside DAY12_DATA_DIR.
25+
DAY12_SECURITY_EVENTS_PATH = DAY12_DATA_DIR / "day12_security_events.csv"
26+
DAY12_COMPLIANCE_AUDIT_PATH = DAY12_DATA_DIR / "day12_compliance_audit.csv"
27+
28+
# Great Expectations configuration
# Name strings only; nothing in this module creates the corresponding objects.
29+
DAY12_GE_PROJECT_NAME = "day12_security_data_quality"
30+
DAY12_GE_DATASOURCE_NAME = "day12_security_logs_datasource"
31+
DAY12_GE_EXPECTATION_SUITE_NAME = "day12_security_validation_suite"
32+
DAY12_GE_CHECKPOINT_NAME = "day12_security_checkpoint"
33+
34+
# Data quality thresholds (cybersecurity-specific)
# Values are fractions, not percentages (0.02 means 2%). The NULL_EVENT_IDS,
# FUTURE_TIMESTAMPS and PII_LEAKAGE values are reassigned to stricter ones
# later in this module when DAY12_ENVIRONMENT is 'production'; the remaining
# two thresholds are the same in every environment.
35+
DAY12_THRESHOLD_NULL_EVENT_IDS = 0.02 # Max 2% null event IDs acceptable
36+
DAY12_THRESHOLD_FUTURE_TIMESTAMPS = 0.01 # Max 1% future timestamps
37+
DAY12_THRESHOLD_PII_LEAKAGE = 0.01 # Max 1% PII in username fields
38+
DAY12_THRESHOLD_MISSING_CRITICAL_FIELDS = 0.05 # Max 5% missing critical fields
39+
DAY12_THRESHOLD_SEVERITY_RISK_CORRELATION = 0.95 # 95% must correlate correctly
40+
41+
# Severity to risk score mapping (for validation)
# Each severity maps to a (low, high) pair of risk scores; together the pairs
# span 0 through 100.
# NOTE(review): the bounds are whole numbers, so a fractional score such as
# 89.5 would fall between 'high' and 'critical' - confirm that risk_score is
# always an integer and that both bounds are meant to be inclusive.
42+
DAY12_SEVERITY_RISK_MAPPING = {
43+
'critical': (90, 100),
44+
'high': (70, 89),
45+
'medium': (40, 69),
46+
'low': (10, 39),
47+
'info': (0, 9)
48+
}
49+
50+
# Required fields for security logs (completeness check)
# Column names as plain strings; presumably these match the header of
# day12_security_events.csv - verify against the data file.
51+
DAY12_REQUIRED_SECURITY_FIELDS = [
52+
'event_id',
53+
'timestamp',
54+
'event_type',
55+
'severity',
56+
'source_system',
57+
'action_taken',
58+
'username',
59+
'source_ip',
60+
'risk_score',
61+
'status'
62+
]
63+
64+
# Valid values for categorical fields
# All values are lowercase. DAY12_VALID_SEVERITIES lists the same keys as
# DAY12_SEVERITY_RISK_MAPPING, so the two must be kept in sync.
65+
DAY12_VALID_SEVERITIES = ['critical', 'high', 'medium', 'low', 'info']
66+
DAY12_VALID_ACTIONS = ['allowed', 'blocked', 'quarantined', 'alerted', 'logged']
67+
DAY12_VALID_STATUSES = ['open', 'investigating', 'resolved', 'false_positive']
68+
69+
# Compliance tags
# The list includes None; presumably this allows events that carry no
# compliance tag - confirm how missing values are represented by the loader.
70+
DAY12_VALID_COMPLIANCE_TAGS = ['HIPAA', 'PCI-DSS', 'SOX', 'GDPR', None]
71+
72+
# Timestamp validation
# DAY12_MIN_TIMESTAMP is an ISO-8601 string with no UTC offset.
# NOTE(review): confirm which timezone consumers assume when parsing it.
73+
DAY12_MIN_TIMESTAMP = "2024-01-01T00:00:00" # Events shouldn't be older than this
74+
DAY12_MAX_FUTURE_HOURS = 1 # Max 1 hour in future (for clock skew tolerance)
75+
76+
# PII detection patterns (regex for username validation)
# The pattern is anchored at both ends (^ ... $), so it matches only when the
# entire value is shaped like an e-mail address, not when an address is merely
# embedded in a longer string.
77+
DAY12_PII_EMAIL_PATTERN = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
78+
79+
# Risk score bounds
# Same overall range that DAY12_SEVERITY_RISK_MAPPING covers (0 through 100).
80+
DAY12_MIN_RISK_SCORE = 0
81+
DAY12_MAX_RISK_SCORE = 100
82+
83+
# Failure actions (what to do when expectations fail)
# Keyed by failure level. Each entry carries an action name, a notify flag,
# a log level name and a human-readable description. This module only
# declares the table; the code that acts on it lives elsewhere.
84+
DAY12_FAILURE_ACTIONS = {
85+
'critical_failure': {
86+
'action': 'block_pipeline',
87+
'notify': True,
88+
'log_level': 'ERROR',
89+
'description': 'Stop processing and alert security team'
90+
},
91+
'warning': {
92+
'action': 'continue_with_warning',
93+
'notify': True,
94+
'log_level': 'WARNING',
95+
'description': 'Log issue but continue processing'
96+
},
97+
'info': {
98+
'action': 'log_only',
99+
'notify': False,
100+
'log_level': 'INFO',
101+
'description': 'Record for audit purposes only'
102+
}
103+
}
104+
105+
# Expectation to failure action mapping
# Values are keys of DAY12_FAILURE_ACTIONS. The mapping is keyed by
# expectation type only, so every use of a given expectation shares one
# failure level regardless of which column it is applied to.
106+
DAY12_EXPECTATION_FAILURE_MAPPING = {
107+
'expect_column_values_to_not_be_null': 'critical_failure', # Missing critical fields
108+
'expect_column_values_to_match_regex': 'critical_failure', # PII leakage
109+
'expect_column_values_to_be_between': 'warning', # Risk score bounds
110+
'expect_column_values_to_be_in_set': 'warning', # Invalid categorical values
111+
'expect_table_row_count_to_be_between': 'info', # Row count monitoring
112+
}
113+
114+
# Validation results storage
# The directory (and any missing parents, including DAY12_LOGS_DIR) is created
# as a side effect of importing this module; an existing directory is left
# untouched.
115+
DAY12_VALIDATION_RESULTS_DIR = DAY12_LOGS_DIR / "validation_results"
116+
DAY12_VALIDATION_RESULTS_DIR.mkdir(exist_ok=True, parents=True)
117+
118+
# Notification settings (environment variables)
# Accept the usual truthy spellings and ignore case and surrounding whitespace,
# so values such as "1", "yes" or " true " are not silently read as disabled.
# An unset variable still defaults to enabled.
DAY12_NOTIFY_ON_FAILURE = (
    os.getenv('DAY12_NOTIFY_ON_FAILURE', 'true').strip().lower()
    in ('true', '1', 'yes', 'on')
)
# Blank values (e.g. "DAY12_SLACK_WEBHOOK_URL=") are normalized to None so that
# "unset" and "set but empty" look the same to consumers.
DAY12_SLACK_WEBHOOK_URL = os.getenv('DAY12_SLACK_WEBHOOK_URL', '').strip() or None
DAY12_ALERT_EMAIL = os.getenv('DAY12_ALERT_EMAIL', '').strip() or None
122+
123+
# Logging configuration
DAY12_LOG_FILE = DAY12_LOGS_DIR / "day12_validation.log"
DAY12_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
# The logging module only recognizes upper-case level names, so normalize the
# value ("info" -> "INFO") and fall back to INFO when the variable is blank.
DAY12_LOG_LEVEL = os.getenv('DAY12_LOG_LEVEL', 'INFO').strip().upper() or 'INFO'
127+
128+
# Great Expectations Data Docs configuration
# Declared here only; nothing in this module builds the docs site.
129+
DAY12_DATA_DOCS_SITE_NAME = "day12_local_site"
130+
DAY12_BUILD_DATA_DOCS = True
131+
132+
# Integration settings (for use in other projects)
# Plain name strings for consumers of this module.
133+
DAY12_CHECKPOINT_RUN_NAME_PREFIX = "day12_validation_run"
134+
DAY12_VALIDATION_OPERATOR_NAME = "day12_action_list_operator"
135+
136+
# Performance settings
# These are fixed in code; unlike the notification and logging settings they
# are not read from environment variables.
137+
DAY12_SAMPLE_SIZE = None # None = validate all rows, or set to int for sampling
138+
DAY12_ENABLE_PROFILING = True # Generate data profiling reports
139+
140+
# Environment-specific overrides
# Normalize case and whitespace before comparing: without this, a value such
# as "Production" or "production " would silently keep the lenient development
# thresholds. A blank value falls back to 'development'.
DAY12_ENVIRONMENT = (
    os.getenv('DAY12_ENVIRONMENT', 'development').strip().lower() or 'development'
)

if DAY12_ENVIRONMENT == 'production':
    # Stricter thresholds in production
    DAY12_THRESHOLD_NULL_EVENT_IDS = 0.001
    DAY12_THRESHOLD_PII_LEAKAGE = 0.0001
    DAY12_THRESHOLD_FUTURE_TIMESTAMPS = 0.001
148+
149+
# Display configuration summary
150+
def day12_print_config_summary():
    """Write the active Day 12 configuration to stdout for a quick sanity check.

    Reports the environment, key paths, Great Expectations names, the three
    environment-dependent thresholds (as percentages) and the notification
    state. Only whether a Slack webhook is set is shown, never its value.
    """
    banner = "=" * 80
    slack_state = 'Configured' if DAY12_SLACK_WEBHOOK_URL else 'Not configured'

    # Build the whole report first, then emit it one line at a time.
    report_lines = [
        banner,
        "DAY 12 - GREAT EXPECTATIONS CONFIGURATION SUMMARY",
        banner,
        f"Environment: {DAY12_ENVIRONMENT}",
        f"Project Root: {DAY12_PROJECT_ROOT}",
        f"Data Directory: {DAY12_DATA_DIR}",
        f"GE Directory: {DAY12_GE_DIR}",
        f"Expectation Suite: {DAY12_GE_EXPECTATION_SUITE_NAME}",
        f"Checkpoint: {DAY12_GE_CHECKPOINT_NAME}",
        "\nData Quality Thresholds:",
        f" - Null Event IDs: {DAY12_THRESHOLD_NULL_EVENT_IDS * 100}%",
        f" - PII Leakage: {DAY12_THRESHOLD_PII_LEAKAGE * 100}%",
        f" - Future Timestamps: {DAY12_THRESHOLD_FUTURE_TIMESTAMPS * 100}%",
        "\nNotifications:",
        f" - Notify on Failure: {DAY12_NOTIFY_ON_FAILURE}",
        f" - Slack Webhook: {slack_state}",
        banner,
    ]
    for line in report_lines:
        print(line)


if __name__ == "__main__":
    # Running the module directly just prints the summary.
    day12_print_config_summary()

0 commit comments

Comments
 (0)