import pandas as pd
import os
import openpyxl
from functools import reduce

# Set working directory
path = (
    r"C:\Users\mauri\OneDrive\Desktop\Documents\My Documents\Job Application Documents\Mophones"
)
os.chdir(path)

# Load each snapshot without adding SnapshotDate
jan = pd.read_excel("Credit Data - 01-01-2025.xlsx"); jan["DATE"] = pd.to_datetime("2025-01-01")
mar = pd.read_excel("Credit Data - 30-03-2025.xlsx"); mar["DATE"] = pd.to_datetime("2025-03-30")
jun = pd.read_excel("Credit Data - 30-06-2025.xlsx"); jun["DATE"] = pd.to_datetime("2025-06-30")
sep = pd.read_excel("Credit Data - 30-09-2025.xlsx"); sep["DATE"] = pd.to_datetime("2025-09-30")
dec = pd.read_excel("Credit Data - 30-12-2025.xlsx"); dec["DATE"] = pd.to_datetime("2025-12-30")
# Combine into one dataframe
credit = pd.concat([jan, mar, jun, sep, dec], ignore_index=True)
credit = credit.loc[:, ~credit.columns.str.contains('^Unnamed')]

# Ensure LOANID is present and consistent
credit["LOANID"] = credit["LOANID"].astype(str)

print("Credit data shape:", credit.shape)


# Load sheets
sales_details = pd.read_excel("Sales and Customer Data.xlsx", sheet_name="Sales Details")
gender        = pd.read_excel("Sales and Customer Data.xlsx", sheet_name="Gender")
dob           = pd.read_excel("Sales and Customer Data.xlsx", sheet_name="DOB")
income        = pd.read_excel("Sales and Customer Data.xlsx", sheet_name="Income Level")

# -------------------------------
# Step 4: Load NPS data
# -------------------------------
nps = pd.read_excel("NPS Data.xlsx")
nps["LOANID"] = nps["LOANID"].astype(str)
nps.drop_duplicates(subset="LOANID", inplace=True)

print("NPS shape:", nps.shape)

# Ensure LOANID is consistent
for df in [sales_details, gender, dob, income]:
    df["LOANID"] = df["LOANID"].astype(str)

# Collapse duplicates: keep first record per LOANID
gender = gender.groupby("LOANID").first().reset_index()
dob    = dob.groupby("LOANID").first().reset_index()
income = income.groupby("LOANID").first().reset_index()


# -------------------------------
# Step 4: Load NPS data
# -------------------------------
nps = pd.read_excel("NPS Data.xlsx")
nps["LOANID"] = nps["LOANID"].astype(str)
nps.drop_duplicates(subset="LOANID", inplace=True)

print("NPS shape:", nps.shape)

Credit data shape: (71456, 33)
NPS shape: (3532, 17)
NPS shape: (3532, 17)

# Merge safely
customers = sales_details.merge(gender, on="LOANID", how="left") \
                         .merge(dob, on="LOANID", how="left") \
                         .merge(income, on="LOANID", how="left")

print("Customers shape:", customers.shape)
# -------------------------------
# Step 5: Merge everything
# -------------------------------
full_data = credit.merge(customers, on="LOANID", how="left") \
                  .merge(nps, on="LOANID", how="left")

print("Full dataset shape:", full_data.shape)

Customers shape: (20747, 27)
Full dataset shape: (71466, 75)

# -----------------------------
# Step 1: Convert date columns
# -----------------------------
# Convert 'date_of_birth' and 'DATE' into proper datetime objects.
# Using errors="coerce" ensures invalid formats are safely converted to NaT.
full_data["date_of_birth"] = pd.to_datetime(full_data["date_of_birth"], format="%Y-%m-%d", errors="coerce")
full_data["DATE"] = pd.to_datetime(full_data["DATE"], format="%Y-%m-%d", errors="coerce")

# -----------------------------
# Step 2: Age calculation
# -----------------------------
# Compute customer age at snapshot date.
# NOTE: Original code subtracted DATE from itself (always zero).
# Correct calculation subtracts year of birth from snapshot year.
full_data["Age"] = full_data["DATE"].dt.year - full_data["date_of_birth"].dt.year

# -----------------------------
# Step 3: Age banding
# -----------------------------
# Define function to group ages into categories.
# Missing ages are labeled "Unknown".
def age_band(age):
    if pd.isna(age): return "Unknown"
    elif age < 26: return "18-25"
    elif age < 36: return "26-35"
    elif age < 46: return "36-45"
    elif age < 56: return "46-55"
    else: return "55+"

# Apply age banding function to Age column.
full_data["AgeBand"] = full_data["Age"].apply(age_band)

# -----------------------------
# Step 4: Income calculation
# -----------------------------
# TotalIncome is the sum of all income sources (mobile money, banks, paybills, etc.).
# AvgMonthlyIncome normalizes income by dividing by employment duration.
full_data["TotalIncome"] = full_data[["Received","Persons Received From Total","Banks Received","Paybills Received Others"]].sum(axis=1)
full_data["AvgMonthlyIncome"] = full_data["TotalIncome"] / full_data["Duration"]

# -----------------------------
# Step 5: Income banding
# -----------------------------
# Categorize customers into income bands for affordability and risk analysis.
bins = [0,5000,10000,20000,30000,50000,100000,150000,999999]
labels = ["<5,000","5,000-9,999","10,000-19,999","20,000-29,999",
          "30,000-49,999","50,000-99,999","100,000-149,999","150,000+"]
full_data["IncomeBand"] = pd.cut(full_data["AvgMonthlyIncome"], 
                                 bins=bins, labels=labels)

# -----------------------------
# Step 6: Data cleaning
# -----------------------------
# Drop rows where IncomeBand is NaN (e.g., missing income or duration).
full_data = full_data.dropna(subset=["IncomeBand"])

# -----------------------------
# Step 7: Output preview
# -----------------------------
# Display first few records with LoanID, AgeBand, AvgMonthlyIncome, and IncomeBand.
print(full_data[["LOANID","AgeBand","AvgMonthlyIncome","IncomeBand"]].head())

               LOANID  AgeBand  AvgMonthlyIncome     IncomeBand
2   recPCzNo2d9qK1VB3  Unknown      66866.623333  50,000-99,999
6   rec6dwN71Phff6ZIW  Unknown     154915.223333       150,000+
11  recZVxmo8rPb9fMZn  Unknown     262233.750000       150,000+
29  recgDvOz8ZFP0gopO  Unknown     259276.500000       150,000+
37  recTocDHUrhpCHoN7  Unknown     441349.393333       150,000+

# Step 1: Convert date columns
# Ensure 'date_of_birth' and 'DATE' are proper datetime objects.
full_data["date_of_birth"] = pd.to_datetime(full_data["date_of_birth"], format="%Y-%m-%d", errors="coerce")
full_data["DATE"] = pd.to_datetime(full_data["DATE"], format="%Y-%m-%d", errors="coerce")

# Step 2: Age calculation
# Compute age at snapshot date (corrected formula).
full_data["Age"] = full_data["DATE"].dt.year - full_data["date_of_birth"].dt.year

# Step 3: Age bands
# Group ages into categories for segmentation.
def age_band(age):
    if pd.isna(age): return "Unknown"
    elif age < 26: return "18-25"
    elif age < 36: return "26-35"
    elif age < 46: return "36-45"
    elif age < 56: return "46-55"
    else: return "55+"

full_data["AgeBand"] = full_data["Age"].apply(age_band)

# Step 4: Income calculation
# Sum income sources and normalize by duration.
full_data["TotalIncome"] = full_data[["Received","Persons Received From Total","Banks Received","Paybills Received Others"]].sum(axis=1)
full_data["AvgMonthlyIncome"] = full_data["TotalIncome"] / full_data["Duration"]

# Step 5: Income bands
# Categorize customers into income ranges.
bins = [0,5000,10000,20000,30000,50000,100000,150000,999999]
labels = ["<5,000","5,000-9,999","10,000-19,999","20,000-29,999","30,000-49,999","50,000-99,999","100,000-149,999","150,000+"]
full_data["IncomeBand"] = pd.cut(full_data["AvgMonthlyIncome"], bins=bins, labels=labels)

# Remove rows where AgeBand is "Unknown"
full_data = full_data[full_data["AgeBand"] != "Unknown"]

# Step 6: Clean data
# Remove rows with missing income bands.
full_data = full_data.dropna(subset=["IncomeBand"])

# Step 7: Preview output
# Show LoanID, AgeBand, AvgMonthlyIncome, and IncomeBand.
print(full_data[["LOANID","AgeBand","AvgMonthlyIncome","IncomeBand"]].head())

                 LOANID AgeBand  AvgMonthlyIncome     IncomeBand
627   rec1WfRVvT8H7sDZO   18-25      12147.086667  10,000-19,999
2239  recYPntN91PIBz0oa   26-35     550066.100833       150,000+
4696  recuwHCtT6fUmuc3M   26-35      50892.300000  50,000-99,999
5176  recxvyalMRb0cgnQz   26-35      34665.815000  30,000-49,999
6998  recY10IF9TZsk6eZ0   36-45      73583.635000  50,000-99,999

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

# ---------------------------------------------
# Step 1: Calculate portfolio health metrics
# ---------------------------------------------
# - ArrearsRate: share of accounts past due
# - RepaymentCoverage: payments received ÷ balances due
# - OutstandingBalance: total loan exposure
# - AdvanceRate: proportion of balances prepaid/advanced
# - PaymentCompliance: actual vs expected payments
# ---------------------------------------------
arrears_rate = credit.groupby("DATE")["ARREARS"].apply(lambda x: (x > 0).mean())
repayment_coverage = credit.groupby("DATE")["TOTAL_PAID"].sum() / credit.groupby("DATE")["TOTAL_DUE_TODAY"].sum()
outstanding_balance = credit.groupby("DATE")["BALANCE"].sum()
advance_rate = (credit.groupby("DATE")["ADVANCE"].sum() + credit.groupby("DATE")["PREPAYMENT_AMOUNT"].sum()) / credit.groupby("DATE")["BALANCE"].sum()
payment_compliance = credit.groupby("DATE")["PAYMENT_AMOUNT"].sum() / credit.groupby("DATE")["EXPECTED_PAYMENT"].sum()

portfolio_metrics = pd.DataFrame({
    "ArrearsRate": arrears_rate,
    "OutstandingBalance": outstanding_balance,
    "RepaymentCoverage": repayment_coverage,
    "advanceRate": advance_rate,
    "PaymentCompliance": payment_compliance
})

print(portfolio_metrics)

# ---------------------------------------------
# Step 2: Visualize portfolio health trends
# ---------------------------------------------
# Left chart: repayment discipline and risk indicators
#   (ArrearsRate, RepaymentCoverage, AdvanceRate, PaymentCompliance)
# Right chart: portfolio exposure trend
#   (OutstandingBalance, formatted in millions)
# ---------------------------------------------
fig, ax = plt.subplots(1, 2, figsize=(14,6))

portfolio_metrics[[
    "ArrearsRate","RepaymentCoverage","advanceRate",
    "PaymentCompliance"]].plot(ax=ax[0], marker="o")
ax[0].set_title("Portfolio Health Metrics Over Time")
ax[0].set_ylabel("Rate")

portfolio_metrics["OutstandingBalance"].plot(ax=ax[1], marker="o", 
                                             color="green")
ax[1].set_title("Outstanding Balance Over Time")
ax[1].set_ylabel("Balance (Millions)")

ax[1].yaxis.set_major_formatter(
    mticker.FuncFormatter(lambda x, _: f'{x/1_000_000:.1f}'))

plt.tight_layout()
plt.close()

            ArrearsRate  OutstandingBalance  RepaymentCoverage  advanceRate  \
DATE                                                                          
2025-01-01     0.607499         342087292.0           0.753263     0.013286   
2025-03-30     0.611665         399987192.4           0.728353     0.014923   
2025-06-30     0.584479         478068036.4           0.714648     0.014065   
2025-09-30     0.564101         546661562.8           0.715136     0.013841   
2025-12-30     0.569232         668547758.2           0.723508     0.013835   

            PaymentCompliance  
DATE                           
2025-01-01           0.445495  
2025-03-30           0.534812  
2025-06-30           0.625038  
2025-09-30           0.553470  
2025-12-30           0.013367

# Arrears rates by income band show higher risk among lower‑income customers.
segment_arrears = full_data.groupby(["DATE","IncomeBand"])["ARREARS"].apply(lambda x: (x>0).mean()).unstack()

segment_arrears.plot(marker="o", figsize=(10,6))
plt.title("Arrears Rate by Income Band")
plt.ylabel("Rate")
plt.close()

# Rename NPS column for clarity
full_data.rename(columns={
    'Using a scale from 0 (not likely) to 10 (very likely), how likely are you to recommend MoPhones to friends or family?': 'NPS Score'
}, inplace=True)

# Categorize NPS scores into Promoter, Passive, Detractor
def categorize_nps(score):
    if score >= 9: return "Promoter"
    elif score >= 7: return "Passive"
    else: return "Detractor"

full_data["NPS_Category"] = full_data["NPS Score"].apply(categorize_nps)

# Average NPS by arrears status
nps_by_arrears = full_data.groupby("BALANCE_DUE_STATUS")["NPS Score"].mean()

# Average NPS by account status
nps_by_status = full_data.groupby("ACCOUNT_STATUS_L2")["NPS Score"].mean()

# NPS distribution (%) by arrears
arrears_dist = full_data.groupby("BALANCE_DUE_STATUS")["NPS_Category"].value_counts(normalize=True).unstack().fillna(0) * 100

# NPS distribution (%) by account status
status_dist = full_data.groupby("ACCOUNT_STATUS_L2")["NPS_Category"].value_counts(normalize=True).unstack().fillna(0) * 100

import matplotlib.pyplot as plt
fig, ax = plt.subplots(2, 2, figsize=(14,10))

# Plot average NPS by arrears
nps_by_arrears.plot(kind="bar", ax=ax[0,0], color="tomato", width=0.6)
ax[0,0].set_title("Average NPS by Arrears")
ax[0,0].set_ylabel("NPS Score")
ax[0,0].set_xticklabels(nps_by_arrears.index, rotation=0)
for i, val in enumerate(nps_by_arrears):
    ax[0,0].text(i, val/2, f"{val:.1f}", ha="center", va="center", color="white", fontweight="bold")

# Plot average NPS by account status
nps_by_status.plot(kind="bar", ax=ax[0,1], color="skyblue", width=0.6)
ax[0,1].set_title("Average NPS by Account Status")
ax[0,1].set_ylabel("NPS Score")
ax[0,1].set_xticklabels(nps_by_status.index, rotation=0)
for i, val in enumerate(nps_by_status):
    ax[0,1].text(i, val/2, f"{val:.1f}", ha="center", va="center", color="black", fontweight="bold")

# Plot NPS distribution by arrears
arrears_dist.plot(kind="bar", ax=ax[1,0], width=0.6)
ax[1,0].set_title("NPS Distribution by Arrears (%)")
ax[1,0].set_ylabel("Percentage")
ax[1,0].set_xticklabels(arrears_dist.index, rotation=0)
for container in ax[1,0].containers:
    ax[1,0].bar_label(container, fmt="%.1f")

# Plot NPS distribution by account status
status_dist.plot(kind="bar", ax=ax[1,1], width=0.6)
ax[1,1].set_title("NPS Distribution by Account Status (%)")
ax[1,1].set_ylabel("Percentage")
ax[1,1].set_xticklabels(status_dist.index, rotation=0)
for container in ax[1,1].containers:
    ax[1,1].bar_label(container, fmt="%.1f")

plt.tight_layout()
plt.close()

# %pip install graphviz
from graphviz import Digraph
from IPython.display import SVG

# Initialize diagram with SVG output
erd = Digraph('MoPhones_ERD', format='svg')
erd.attr(rankdir='LR', size='8,5', fontname="Arial")

# Define entities with professional font, larger size, and colors
erd.node('Customer',
         "Customer\n"
         "-----------------\n"
         "LoanID (PK)\n"
         "Name, Gender, DOB\n"
         "IncomeLevel, EmploymentDuration\n"
         "AgeBand, IncomeBand\n"
         "[Demographic Data]",
         shape='box', style='filled', fillcolor='lightblue',
         fontname="Arial", fontsize="14")

erd.node('Sales',
         "Sales\n"
         "-----------------\n"
         "SaleID (PK)\n"
         "LoanID (FK)\n"
         "TransactionDate\n"
         "Product, Amount\n"
         "[Transaction Data]",
         shape='box', style='filled', fillcolor='lightgreen',
         fontname="Arial", fontsize="14")

erd.node('CreditSnapshot',
         "CreditSnapshot\n"
         "-----------------\n"
         "SnapshotID (PK)\n"
         "LoanID (FK)\n"
         "SnapshotDate\n"
         "Balance, Arrears, Status\n"
         "[Quarterly Snapshots]",
         shape='box', style='filled', fillcolor='orange',
         fontname="Arial", fontsize="14")

erd.node('NPSResponse',
         "NPSResponse\n"
         "-----------------\n"
         "ResponseID (PK)\n"
         "LoanID (FK)\n"
         "SurveyDate\n"
         "NPSScore, Comments\n"
         "[Survey Data]",
         shape='box', style='filled', fillcolor='violet',
         fontname="Arial", fontsize="14")

# Define relationships with clearer labels
erd.edge('Customer', 'Sales', label='Purchases', color='green',
         fontname="Arial", fontsize="12")
erd.edge('Customer', 'CreditSnapshot', label='Credit History', color='orange',
         fontname="Arial", fontsize="12")
erd.edge('Customer', 'NPSResponse', label='Satisfaction Surveys', color='purple',
         fontname="Arial", fontsize="12")

# Render diagram to SVG and display inline in Jupyter
erd.render('MoPhones_ERD')

'MoPhones_ERD.svg'

MoPhones | Case Study

Data Analyst – Product & Credit

Data Loading

Data Integration

Data Preparation

Portfolio Health Metrics

Portfolio Health Trends

Segment Analysis & Operational Implications

Collections Discipline vs Customer Satisfaction

Collections Discipline vs Customer Satisfaction (Cont.)

Data Quality – Limitations

Recommendations for Improvement

MoPhones Data Model for Repeatable Reporting