
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# Load Data
# =============================================================================
file_path = 'user_source_data/inc_dump 1.xlsx'
df = pd.read_excel(file_path)

print("=" * 70)
print("PHASE 2: CLEANING & EXPLORATORY DATA ANALYSIS")
print("=" * 70)

# =============================================================================
# Cleaning Operations
# =============================================================================
print("\n--- Cleaning Operations ---")

# 1. Extract numeric priority level
df['priority_num'] = df['priority'].str.extract(r'(\d+)').astype(int)
print(f"Priority levels extracted: {sorted(df['priority_num'].unique())}")

# 2. Parse temporal columns (already datetime but ensure)
df['sys_created_on'] = pd.to_datetime(df['sys_created_on'])
df['closed_at'] = pd.to_datetime(df['closed_at'])

# 3. Create resolution timestamp: use closed_at if available, otherwise sys_created_on for open tickets
# For MTTR, we calculate from sys_created_on to closed_at (resolution)
df['is_resolved'] = df['closed_at'].notna()

# 4. Compute MTTR in hours for resolved tickets
df['mttr_hours'] = np.where(
    df['is_resolved'],
    (df['closed_at'] - df['sys_created_on']).dt.total_seconds() / 3600,
    np.nan
)

# 5. Compute ticket age in hours for open tickets (using current time: 2026-06-05)
current_time = pd.Timestamp('2026-06-05 08:35:22')
df['ticket_age_hours'] = np.where(
    ~df['is_resolved'],
    (current_time - df['sys_created_on']).dt.total_seconds() / 3600,
    np.nan
)

# 6. Define SLA thresholds (standard ITIL, hours)
sla_thresholds = {1: 4, 2: 8, 3: 24, 4: 72}
df['sla_threshold_hours'] = df['priority_num'].map(sla_thresholds)

# 7. SLA Breach flag for resolved tickets
df['sla_breached'] = np.where(
    df['is_resolved'],
    df['mttr_hours'] > df['sla_threshold_hours'],
    np.nan
)

# 8. Create categorical bins for MTTR
mttr_bins = [0, 4, 8, 24, 72, 168, 720, np.inf]
mttr_labels = ['0-4h', '4-8h', '8-24h', '1-3d', '3-7d', '7-30d', '30d+']
df['mttr_bucket'] = pd.cut(df['mttr_hours'], bins=mttr_bins, labels=mttr_labels, include_lowest=True)

# 9. Create hour-of-day and day-of-week features
df['created_hour'] = df['sys_created_on'].dt.hour
df['created_dow'] = df['sys_created_on'].dt.dayofweek  # 0=Monday
df['created_dow_name'] = df['sys_created_on'].dt.day_name()
df['created_month'] = df['sys_created_on'].dt.month

# 10. Simplify state into Open vs Closed categories
open_states = ['New', 'Assigned', 'Work in Progress', 'Pending']
df['status_category'] = df['incident_state'].apply(lambda x: 'Open' if x in open_states else 'Closed/Resolved')

print(f"Resolved tickets: {df['is_resolved'].sum():,}")
print(f"Open tickets: {(~df['is_resolved']).sum():,}")
print(f"SLA Breached (resolved): {df['sla_breached'].sum():,.0f}")
print(f"SLA Breach Rate (resolved): {df['sla_breached'].mean()*100:.1f}%")

# =============================================================================
# Save Cleaned Data
# =============================================================================
cleaned_path = 'temp_files/cleaned.csv'
df.to_csv(cleaned_path, index=False)
print(f"\nCleaned data saved to: {cleaned_path}")

# =============================================================================
# EDA Visualizations
# =============================================================================
print("\n--- Generating EDA Visualizations ---")

# Plot 1: Priority Distribution
fig1 = px.pie(df, names='priority', title='Incident Priority Distribution',
              color_discrete_sequence=px.colors.sequential.RdBu)
fig1.update_traces(textposition='inside', textinfo='percent+label')
fig1.write_html('assets/images/html/eda_priority_dist.html')
fig1.write_image('assets/images/png/eda_priority_dist.png', width=900, height=600, scale=2)
print("Saved: eda_priority_dist")

# Plot 2: Incident State Distribution
state_counts = df['incident_state'].value_counts().reset_index()
state_counts.columns = ['incident_state', 'count']
fig2 = px.bar(state_counts, x='incident_state', y='count', 
              title='Incident State Distribution',
              color='count', color_continuous_scale='Blues')
fig2.update_layout(xaxis_title='Incident State', yaxis_title='Count')
fig2.write_html('assets/images/html/eda_state_dist.html')
fig2.write_image('assets/images/png/eda_state_dist.png', width=1000, height=600, scale=2)
print("Saved: eda_state_dist")

# Plot 3: Top 15 Towers by Volume
tower_counts = df['Tower'].value_counts().head(15).reset_index()
tower_counts.columns = ['Tower', 'count']
fig3 = px.bar(tower_counts, x='Tower', y='count',
              title='Top 15 Towers by Incident Volume',
              color='count', color_continuous_scale='Viridis')
fig3.update_layout(xaxis_title='Tower', yaxis_title='Incident Count', xaxis_tickangle=-45)
fig3.write_html('assets/images/html/eda_tower_volume.html')
fig3.write_image('assets/images/png/eda_tower_volume.png', width=1100, height=600, scale=2)
print("Saved: eda_tower_volume")

# Plot 4: MTTR Distribution (resolved tickets only, log scale)
resolved_df = df[df['is_resolved']].copy()
fig4 = px.histogram(resolved_df, x='mttr_hours', nbins=100, log_y=True,
                    title='MTTR Distribution (Resolved Tickets) - Log Scale',
                    color_discrete_sequence=['#2E86AB'])
fig4.update_layout(xaxis_title='MTTR (Hours)', yaxis_title='Count (Log Scale)')
fig4.add_vline(x=resolved_df['mttr_hours'].median(), line_dash="dash", line_color="red",
               annotation_text=f"Median: {resolved_df['mttr_hours'].median():.1f}h")
fig4.write_html('assets/images/html/eda_mttr_dist.html')
fig4.write_image('assets/images/png/eda_mttr_dist.png', width=1000, height=600, scale=2)
print("Saved: eda_mttr_dist")

# Plot 5: MTTR by Priority Box Plot
fig5 = px.box(resolved_df, x='priority', y='mttr_hours', color='priority',
              title='MTTR by Priority Level',
              category_orders={'priority': ['1 - Critical', '2 - High', '3 - Medium', '4 - Low']})
fig5.update_layout(yaxis_type='log', xaxis_title='Priority', yaxis_title='MTTR (Hours, Log Scale)')
fig5.write_html('assets/images/html/eda_mttr_by_priority.html')
fig5.write_image('assets/images/png/eda_mttr_by_priority.png', width=1000, height=600, scale=2)
print("Saved: eda_mttr_by_priority")

# Plot 6: Domain vs Status Category
domain_status = df.groupby(['Domain', 'status_category']).size().reset_index(name='count')
fig6 = px.bar(domain_status, x='Domain', y='count', color='status_category',
              title='Open vs Closed Tickets by Domain',
              barmode='group', color_discrete_map={'Open':'#E94F37', 'Closed/Resolved':'#2E86AB'})
fig6.update_layout(xaxis_title='Domain', yaxis_title='Count', xaxis_tickangle=-35)
fig6.write_html('assets/images/html/eda_domain_status.html')
fig6.write_image('assets/images/png/eda_domain_status.png', width=1100, height=600, scale=2)
print("Saved: eda_domain_status")

# Plot 7: Ticket Creation Heatmap (Hour vs Day of Week)
heatmap_data = df.groupby(['created_dow_name', 'created_hour']).size().reset_index(name='count')
dow_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
heatmap_data['created_dow_name'] = pd.Categorical(heatmap_data['created_dow_name'], categories=dow_order, ordered=True)
heatmap_pivot = heatmap_data.pivot(index='created_dow_name', columns='created_hour', values='count').fillna(0)

fig7 = px.imshow(heatmap_pivot, 
                 title='Incident Creation Heatmap (Day vs Hour)',
                 color_continuous_scale='YlOrRd',
                 aspect='auto')
fig7.update_layout(xaxis_title='Hour of Day', yaxis_title='Day of Week')
fig7.write_html('assets/images/html/eda_creation_heatmap.html')
fig7.write_image('assets/images/png/eda_creation_heatmap.png', width=1200, height=500, scale=2)
print("Saved: eda_creation_heatmap")

# Plot 8: Open Ticket Aging Distribution
open_df = df[~df['is_resolved']].copy()
open_df['age_days'] = open_df['ticket_age_hours'] / 24
fig8 = px.histogram(open_df, x='age_days', nbins=50,
                    title='Open Ticket Aging Distribution',
                    color_discrete_sequence=['#E94F37'])
fig8.update_layout(xaxis_title='Age (Days)', yaxis_title='Count')
fig8.add_vline(x=open_df['age_days'].median(), line_dash="dash", line_color="blue",
               annotation_text=f"Median: {open_df['age_days'].median():.1f}d")
fig8.write_html('assets/images/html/eda_open_ticket_aging.html')
fig8.write_image('assets/images/png/eda_open_ticket_aging.png', width=1000, height=600, scale=2)
print("Saved: eda_open_ticket_aging")

# Summary statistics
print("\n--- Summary Statistics ---")
print(f"Total Tickets: {len(df):,}")
print(f"Resolved: {df['is_resolved'].sum():,} ({df['is_resolved'].mean()*100:.1f}%)")
print(f"Open: {(~df['is_resolved']).sum():,} ({(~df['is_resolved']).mean()*100:.1f}%)")
print(f"\nMTTR (hours) - Resolved Tickets:")
print(resolved_df['mttr_hours'].describe())
print(f"\nOpen Ticket Age (days):")
print(open_df['age_days'].describe())
print(f"\nSLA Breach Rate by Priority:")
print(df[df['is_resolved']].groupby('priority')['sla_breached'].mean() * 100)

print("\n" + "=" * 70)
print("CLEANING & EDA COMPLETE")
print("=" * 70)
