# Project 8: Customer Segmentation with K-Means Clustering

"""
Description:
This project demonstrates customer segmentation using the K-Means clustering algorithm.
K-Means is an unsupervised machine learning algorithm used to partition n observations
into k clusters, where each observation belongs to the cluster with the nearest mean (centroid).
We will generate a synthetic dataset representing customer behavior and then apply K-Means
to identify distinct customer segments.

Use Case:
Marketing strategy development, personalized recommendations, resource allocation,
identifying customer groups with similar purchasing habits or demographics.

Concepts Covered:
- Unsupervised Learning
- Clustering algorithms
- K-Means clustering algorithm
- Elbow method for determining optimal number of clusters (K)
- Data generation and visualization
- Model training and prediction with scikit-learn
- Interpreting clusters
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs # For generating synthetic data

# 1. Generate Synthetic Customer Data
# Four customer segments are simulated over two features: 'Annual Income (k$)'
# and 'Spending Score (1-100)'.
#
# The blob centers are given explicitly, in the features' own units. When
# make_blobs picks centers itself it draws them from (-10, 10), so rescaling
# them with "* 10 + 50" can land anywhere in roughly -50..150 -- i.e. negative
# incomes and spending scores outside the advertised 1-100 range.
print("\nGenerating synthetic customer data...")
segment_centers = [
    [30, 25],  # lower income, lower spending
    [30, 75],  # lower income, higher spending
    [90, 25],  # higher income, lower spending
    [90, 75],  # higher income, higher spending
]
X, y_true = make_blobs(
    n_samples=300,
    centers=segment_centers,
    cluster_std=6.0,  # spread per segment, in the same units as the centers
    random_state=42,
)

# Convert to DataFrame for better readability.
# The clips are only a guard for extreme tail samples: income cannot be
# negative and the spending score must stay inside its 1-100 scale.
data = pd.DataFrame({
    'Annual Income (k$)': np.clip(X[:, 0], 0, None),
    'Spending Score (1-100)': np.clip(X[:, 1], 1, 100),
})

print("Synthetic Customer Data Head:")
print(data.head())
print("\nDataset Description:")
print(data.describe())

# Scatter plot of the unlabeled customers, drawn before any clustering is applied.
raw_fig, raw_ax = plt.subplots(figsize=(10, 7))
raw_ax.scatter(
    data['Annual Income (k$)'],
    data['Spending Score (1-100)'],
    s=50,
    alpha=0.7,
)
raw_ax.set_xlabel('Annual Income (k$)')
raw_ax.set_ylabel('Spending Score (1-100)')
raw_ax.set_title('Raw Customer Data (Before Clustering)')
raw_ax.grid(True)
plt.show()

# 2. Preprocess Data: Scaling
# K-Means relies on distances, so features are standardized to keep any one
# of them from dominating. The fitted scaler is kept so that results can be
# mapped back to the original units later.
print("\nScaling customer data...")
scaler = StandardScaler().fit(data)
X_scaled = scaler.transform(data)

# 3. Determine Optimal Number of Clusters (K) using the Elbow Method
# For each candidate K a K-Means model is fitted and its WCSS (Within-Cluster
# Sum of Squares, exposed by scikit-learn as `inertia_`) is recorded. Plotting
# WCSS against K shows an "elbow" where adding clusters stops paying off.
print("\nApplying Elbow Method to find optimal K...")
candidate_ks = range(1, 11)  # Test K from 1 to 10
wcss = [
    KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42)
    .fit(X_scaled)
    .inertia_
    for k in candidate_ks
]

elbow_fig, elbow_ax = plt.subplots(figsize=(10, 7))
elbow_ax.plot(candidate_ks, wcss, marker='o', linestyle='--')
elbow_ax.set_xlabel('Number of Clusters (K)')
elbow_ax.set_ylabel('WCSS (Within-Cluster Sum of Squares)')
elbow_ax.set_title('Elbow Method for Optimal K')
elbow_ax.set_xticks(list(candidate_ks))
elbow_ax.grid(True)
plt.show()

print("Based on the elbow plot, choose an appropriate K (e.g., 4 for this synthetic data).")
# K is fixed by hand here rather than derived from the curve.
optimal_k = 4

# 4. Apply K-Means Clustering with the Optimal K
# Fit the final model on the scaled features and read each customer's
# cluster assignment from the fitted estimator.
print(f"\nApplying K-Means with K = {optimal_k}...")
kmeans = KMeans(
    n_clusters=optimal_k,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=42,
)
kmeans.fit(X_scaled)
clusters = kmeans.labels_

# Attach the labels to the customer table (still in original units)
data = data.assign(Cluster=clusters)

print("\nCustomer Data with Cluster Labels (Head):")
print(data.head())

# 5. Visualize the Clusters
plt.figure(figsize=(10, 7))
# Plot each cluster with a different color. The legend uses the same
# zero-based ids that are stored in the 'Cluster' column, so the plot can be
# read side by side with the printed table and the per-cluster means.
for cluster_id in range(optimal_k):
    # Select the cluster's rows once instead of re-evaluating the mask per axis.
    members = data[data['Cluster'] == cluster_id]
    plt.scatter(
        members['Annual Income (k$)'],
        members['Spending Score (1-100)'],
        s=50, label=f'Cluster {cluster_id}', alpha=0.7
    )

# Plot the cluster centroids. The model was fitted on standardized features,
# so the centers are mapped back to the original units before plotting.
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    s=200, marker='X', c='black', edgecolor='white', label='Centroids'
)

plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title(f'Customer Segments (K-Means with K={optimal_k})')
plt.legend()
plt.grid(True)
plt.show()

# 6. Interpret the Clusters
# Summarize every cluster by the average of each original (unscaled) feature.
print("\nCluster Analysis (Mean values for each feature per cluster):")
cluster_means = data.groupby('Cluster').agg('mean')
print(cluster_means)

# Example interpretation. K-Means label numbers are arbitrary, so match each
# description to a cluster id by reading the means printed above:
# - Low income, low spending   -> Frugal
# - High income, high spending -> Spenders
# - Low income, high spending  -> Target Customers
# - High income, low spending  -> Careful Savers

print("\nInterpretation of clusters helps in tailoring marketing strategies.")