
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# Load Cleaned Data
# =============================================================================
# Read the cleaned ticket export; the two timestamp columns are parsed as datetimes.
df = pd.read_csv(
    'temp_files/cleaned.csv',
    parse_dates=['sys_created_on', 'closed_at'],
)

# Phase banner (three lines: rule, title, rule)
print(
    "=" * 70,
    "PHASE 3B: FEATURE ENGINEERING FOR SLA BREACH PREDICTION",
    "=" * 70,
    sep="\n",
)

# =============================================================================
# Filter to Resolved Tickets Only (for modeling)
# =============================================================================
# Work on an independent copy so the columns added later do not touch `df`.
model_df = df.loc[df['is_resolved'].eq(True)].copy()
print(f"\nModeling dataset: {model_df.shape[0]:,} resolved tickets")

# =============================================================================
# Feature Engineering
# =============================================================================

# 1. Target variable: store the breach flag as an integer column
model_df['sla_breached'] = model_df['sla_breached'].astype(int)

# 2. Priority features: parse the first run of digits out of the priority label.
# expand=False makes str.extract return a Series, so the assignment does not
# depend on pandas accepting a one-column DataFrame as a column value.
# astype(int) raises if any priority value contains no digits.
model_df['priority_num'] = (
    model_df['priority'].str.extract(r'(\d+)', expand=False).astype(int)
)

# 3. Temporal features derived from the ticket creation timestamp
model_df['created_hour'] = model_df['sys_created_on'].dt.hour
model_df['created_dow'] = model_df['sys_created_on'].dt.dayofweek  # Monday=0 ... Sunday=6
model_df['created_month'] = model_df['sys_created_on'].dt.month
model_df['is_weekend'] = (model_df['created_dow'] >= 5).astype(int)
# Hours 9 through 17 inclusive count as business hours (between() is inclusive on both ends).
model_df['is_business_hours'] = model_df['created_hour'].between(9, 17).astype(int)

# 4. Reduce cardinality of assignment_group
# Group assignment groups with < 50 tickets into "Other"
group_counts = model_df['assignment_group'].value_counts()
rare_groups = group_counts[group_counts < 50].index
# A boolean mask + where() relabels all rare groups in one vectorised pass;
# missing values are not counted by value_counts(), so they are left untouched.
is_rare_group = model_df['assignment_group'].isin(rare_groups)
model_df['assignment_group_agg'] = model_df['assignment_group'].where(~is_rare_group, 'Other-Rare-Group')
print(f"Reduced assignment groups from {model_df['assignment_group'].nunique()} to {model_df['assignment_group_agg'].nunique()}")

# 5. One-hot encode the categorical columns (the first level of each is dropped)
cat_features = [
    'priority',
    'assignment_group_agg',
    'Tower',
    'Domain',
    'incident_state',
]
model_encoded = pd.get_dummies(data=model_df, columns=cat_features, drop_first=True)

# 6. Feature columns are every encoded column that is not explicitly excluded
#    (priority_num is left out because the priority dummies are used instead)
exclude_cols = [
    'Tkt #', 'short_description', 'sys_created_on', 'closed_at',
    'mttr_hours', 'ticket_age_hours', 'is_resolved', 'sla_threshold_hours',
    'mttr_bucket', 'created_dow_name', 'status_category', 'assignment_group',
    'sla_breached', 'priority_num',
]

feature_cols = [
    column
    for column in model_encoded.columns
    if column not in exclude_cols
]
print(f"Feature count: {len(feature_cols)}")

# 7. Split into the feature matrix and the target vector
X = model_encoded.loc[:, feature_cols]
y = model_encoded.loc[:, 'sla_breached']

print("\nTarget distribution:", y.value_counts(), sep="\n")
print(f"Breach rate: {100 * y.mean():.1f}%")

# 8. Save engineered features (feature columns followed by the target column)
feature_data = pd.concat([X, y], axis=1)
feature_data.to_csv('temp_files/engineered_features.csv', index=False)

# Save feature column list, one name per line.
# The encoding is pinned to UTF-8 so the names match the CSV header written
# above (to_csv defaults to UTF-8) regardless of the platform's default
# encoding; dummy-column names are built from data values and may be non-ASCII.
with open('temp_files/feature_columns.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{col}\n" for col in feature_cols)

print("\nEngineered features saved to: temp_files/engineered_features.csv")
print("Feature columns saved to: temp_files/feature_columns.txt")

print("\n" + "=" * 70)
print("FEATURE ENGINEERING COMPLETE")
print("=" * 70)
