|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Day 12 - Configuration Settings for Great Expectations Data Quality Framework |
| 4 | +Security-focused data validation configuration |
| 5 | +""" |
| 6 | + |
| 7 | +import os |
| 8 | +from pathlib import Path |
| 9 | +from dotenv import load_dotenv |
| 10 | + |
| 11 | +# Load environment variables |
| 12 | +load_dotenv() |
| 13 | + |
| 14 | +# ============================================================================ |
| 15 | +# DAY 12 CONFIGURATION - GREAT EXPECTATIONS DATA QUALITY |
| 16 | +# ============================================================================ |
| 17 | + |
# Project paths: every location is derived from the directory that contains
# this settings module.
DAY12_PROJECT_ROOT = Path(__file__).parent
DAY12_DATA_DIR = DAY12_PROJECT_ROOT.joinpath("data")
DAY12_GE_DIR = DAY12_PROJECT_ROOT.joinpath("great_expectations")
DAY12_LOGS_DIR = DAY12_PROJECT_ROOT.joinpath("logs")

# Data source paths (CSV files located in the data directory)
DAY12_SECURITY_EVENTS_PATH = DAY12_DATA_DIR.joinpath("day12_security_events.csv")
DAY12_COMPLIANCE_AUDIT_PATH = DAY12_DATA_DIR.joinpath("day12_compliance_audit.csv")

# Great Expectations configuration: names of the project, datasource,
# expectation suite and checkpoint.
DAY12_GE_PROJECT_NAME = 'day12_security_data_quality'
DAY12_GE_DATASOURCE_NAME = 'day12_security_logs_datasource'
DAY12_GE_EXPECTATION_SUITE_NAME = 'day12_security_validation_suite'
DAY12_GE_CHECKPOINT_NAME = 'day12_security_checkpoint'
| 33 | + |
# Data quality thresholds (cybersecurity-specific), expressed as fractions
# of rows rather than percentages.

# Max 2% null event IDs acceptable
DAY12_THRESHOLD_NULL_EVENT_IDS = 0.02
# Max 1% future timestamps
DAY12_THRESHOLD_FUTURE_TIMESTAMPS = 0.01
# Max 1% PII in username fields
DAY12_THRESHOLD_PII_LEAKAGE = 0.01
# Max 5% missing critical fields
DAY12_THRESHOLD_MISSING_CRITICAL_FIELDS = 0.05
# 95% must correlate correctly
DAY12_THRESHOLD_SEVERITY_RISK_CORRELATION = 0.95

# Severity to risk score mapping (for validation): severity -> (low, high).
# NOTE(review): the bands use integer bounds (e.g. 'high' ends at 89 and
# 'critical' starts at 90) - presumably risk scores are integers; confirm.
DAY12_SEVERITY_RISK_MAPPING = dict(
    critical=(90, 100),
    high=(70, 89),
    medium=(40, 69),
    low=(10, 39),
    info=(0, 9),
)
| 49 | + |
# Required fields for security logs (completeness check)
DAY12_REQUIRED_SECURITY_FIELDS = [
    'event_id', 'timestamp', 'event_type', 'severity', 'source_system',
    'action_taken', 'username', 'source_ip', 'risk_score', 'status',
]

# Valid values for categorical fields
DAY12_VALID_SEVERITIES = [
    'critical',
    'high',
    'medium',
    'low',
    'info',
]
DAY12_VALID_ACTIONS = [
    'allowed',
    'blocked',
    'quarantined',
    'alerted',
    'logged',
]
DAY12_VALID_STATUSES = [
    'open',
    'investigating',
    'resolved',
    'false_positive',
]

# Compliance tags. None is listed next to the named frameworks -
# presumably so that untagged events are accepted; verify against the suite.
DAY12_VALID_COMPLIANCE_TAGS = [
    'HIPAA',
    'PCI-DSS',
    'SOX',
    'GDPR',
    None,
]

# Timestamp validation
# Events shouldn't be older than this (the string carries no UTC offset).
DAY12_MIN_TIMESTAMP = "2024-01-01T00:00:00"
# Max 1 hour in future (for clock skew tolerance)
DAY12_MAX_FUTURE_HOURS = 1

# PII detection patterns (regex for username validation). The pattern is
# anchored at both ends, so it matches only values that are an e-mail
# address in their entirety.
DAY12_PII_EMAIL_PATTERN = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'

# Risk score bounds
DAY12_MIN_RISK_SCORE, DAY12_MAX_RISK_SCORE = 0, 100
| 82 | + |
# Failure actions (what to do when expectations fail). Each entry names the
# action, whether to notify, the log level and a short description.
DAY12_FAILURE_ACTIONS = dict(
    critical_failure=dict(
        action='block_pipeline',
        notify=True,
        log_level='ERROR',
        description='Stop processing and alert security team',
    ),
    warning=dict(
        action='continue_with_warning',
        notify=True,
        log_level='WARNING',
        description='Log issue but continue processing',
    ),
    info=dict(
        action='log_only',
        notify=False,
        log_level='INFO',
        description='Record for audit purposes only',
    ),
)

# Expectation to failure action mapping: expectation type -> key of
# DAY12_FAILURE_ACTIONS.
DAY12_EXPECTATION_FAILURE_MAPPING = dict(
    # Missing critical fields
    expect_column_values_to_not_be_null='critical_failure',
    # PII leakage
    expect_column_values_to_match_regex='critical_failure',
    # Risk score bounds
    expect_column_values_to_be_between='warning',
    # Invalid categorical values
    expect_column_values_to_be_in_set='warning',
    # Row count monitoring
    expect_table_row_count_to_be_between='info',
)
| 113 | + |
# Validation results storage. The directory (including any missing parent
# directories) is created when this module is imported; an already existing
# directory is left as is.
DAY12_VALIDATION_RESULTS_DIR = DAY12_LOGS_DIR.joinpath("validation_results")
DAY12_VALIDATION_RESULTS_DIR.mkdir(parents=True, exist_ok=True)
| 117 | + |
# Notification settings (environment variables)
def day12_env_flag(name, default=False):
    """Return the boolean value of the environment variable *name*.

    Surrounding whitespace and letter case are ignored. The values
    ``1``, ``true``, ``yes`` and ``on`` count as enabled; any other value,
    including an empty string, counts as disabled.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is not set at all.

    Returns:
        ``True`` or ``False`` as described above, or *default* if unset.
    """
    raw_value = os.getenv(name)
    if raw_value is None:
        return default
    return raw_value.strip().lower() in {'1', 'true', 'yes', 'on'}


# Enabled by default; previously only the exact text "true" switched it on,
# so values such as "1" or "true " silently disabled failure notifications.
DAY12_NOTIFY_ON_FAILURE = day12_env_flag('DAY12_NOTIFY_ON_FAILURE', default=True)
# Both are None when the corresponding variable is not set.
DAY12_SLACK_WEBHOOK_URL = os.getenv('DAY12_SLACK_WEBHOOK_URL')
DAY12_ALERT_EMAIL = os.getenv('DAY12_ALERT_EMAIL')
| 122 | + |
# Logging configuration: log file location, record format and level.
# The level is read from the DAY12_LOG_LEVEL environment variable and
# falls back to 'INFO' when that variable is not set.
DAY12_LOG_FILE = DAY12_LOGS_DIR.joinpath("day12_validation.log")
DAY12_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
DAY12_LOG_LEVEL = os.environ.get('DAY12_LOG_LEVEL', 'INFO')
| 127 | + |
# Great Expectations Data Docs configuration
DAY12_DATA_DOCS_SITE_NAME = 'day12_local_site'
DAY12_BUILD_DATA_DOCS = True

# Integration settings (for use in other projects)
DAY12_CHECKPOINT_RUN_NAME_PREFIX = 'day12_validation_run'
DAY12_VALIDATION_OPERATOR_NAME = 'day12_action_list_operator'

# Performance settings
# None = validate all rows, or set to int for sampling
DAY12_SAMPLE_SIZE = None
# Generate data profiling reports
DAY12_ENABLE_PROFILING = True
| 139 | + |
# Environment-specific overrides. The environment name is read from the
# DAY12_ENVIRONMENT variable and defaults to 'development'; the value is
# stored exactly as given.
DAY12_ENVIRONMENT = os.getenv('DAY12_ENVIRONMENT', 'development')

# Compare case-insensitively and ignore surrounding whitespace, so that a
# value such as "Production" or "production " still selects the strict
# thresholds instead of silently keeping the lenient defaults.
if DAY12_ENVIRONMENT.strip().lower() == 'production':
    # Stricter thresholds in production
    DAY12_THRESHOLD_NULL_EVENT_IDS = 0.001
    DAY12_THRESHOLD_PII_LEAKAGE = 0.0001
    DAY12_THRESHOLD_FUTURE_TIMESTAMPS = 0.001
| 148 | + |
# Display configuration summary
def day12_print_config_summary():
    """Write an overview of the active settings to standard output.

    The report lists the environment, the main paths, the expectation suite
    and checkpoint names, three data-quality thresholds (multiplied by 100
    and shown as percentages) and the notification state. For the Slack
    webhook only "Configured" / "Not configured" is shown, never the URL.
    """
    banner = "=" * 80
    slack_state = 'Configured' if DAY12_SLACK_WEBHOOK_URL else 'Not configured'

    report_lines = (
        banner,
        "DAY 12 - GREAT EXPECTATIONS CONFIGURATION SUMMARY",
        banner,
        f"Environment: {DAY12_ENVIRONMENT}",
        f"Project Root: {DAY12_PROJECT_ROOT}",
        f"Data Directory: {DAY12_DATA_DIR}",
        f"GE Directory: {DAY12_GE_DIR}",
        f"Expectation Suite: {DAY12_GE_EXPECTATION_SUITE_NAME}",
        f"Checkpoint: {DAY12_GE_CHECKPOINT_NAME}",
        "\nData Quality Thresholds:",
        f" - Null Event IDs: {DAY12_THRESHOLD_NULL_EVENT_IDS * 100}%",
        f" - PII Leakage: {DAY12_THRESHOLD_PII_LEAKAGE * 100}%",
        f" - Future Timestamps: {DAY12_THRESHOLD_FUTURE_TIMESTAMPS * 100}%",
        "\nNotifications:",
        f" - Notify on Failure: {DAY12_NOTIFY_ON_FAILURE}",
        f" - Slack Webhook: {slack_state}",
        banner,
    )
    # One print call per entry keeps the output identical to printing each
    # line individually.
    for report_line in report_lines:
        print(report_line)


# Running the module as a script prints the summary.
if __name__ == "__main__":
    day12_print_config_summary()
0 commit comments