from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, avg, count, sum as spark_sum
import pandas as pd
import numpy as np
import seaborn as sns

# Create (or reuse) the Spark session that backs every DataFrame below.
spark = SparkSession.builder \
    .appName("SalesData") \
    .getOrCreate()

# Everything that uses the session runs inside try/finally so that
# spark.stop() is still called when one of the steps below raises; without
# it, a failure anywhere in the script would skip the explicit shutdown.
try:
    print("PySpark SalesData Application")
    print(f"Spark Version: {spark.version}")
    print("=" * 50)

    # Create sample data - sales data
    # The third StructField argument is the nullable flag: only product_id
    # is declared non-nullable.
    sales_schema = StructType([
        StructField('product_id', IntegerType(), False),
        StructField('product_name', StringType(), True),
        StructField('category', StringType(), True),
        StructField('price', DoubleType(), True),
        StructField('quantity_sold', IntegerType(), True),
        StructField('region', StringType(), True)
    ])

    # One tuple per sales record, in schema column order. The same product
    # can appear in several rows (e.g. once per region).
    sales_data = [
        (1, 'Laptop', 'Electronics', 999.99, 5, 'North'),
        (2, 'Mouse', 'Electronics', 29.99, 20, 'North'),
        (3, 'Keyboard', 'Electronics', 79.99, 15, 'South'),
        (4, 'Coffee Mug', 'Kitchen', 12.99, 50, 'East'),
        (5, 'Desk Lamp', 'Furniture', 45.99, 8, 'West'),
        (6, 'Monitor', 'Electronics', 299.99, 12, 'North'),
        (7, 'Coffee Mug', 'Kitchen', 12.99, 30, 'West'),
        (8, 'Laptop', 'Electronics', 999.99, 3, 'South'),
        (9, 'Desk Chair', 'Furniture', 199.99, 6, 'East'),
        (10, 'Mouse', 'Electronics', 29.99, 25, 'North')
    ]

    # Create DataFrame
    sales_df = spark.createDataFrame(sales_data, schema=sales_schema)

    print("Sample Data:")
    # Show up to 10 rows without truncating long cell values.
    sales_df.show(10, truncate=False)

    print("\nData Analysis Results:")
    print("-" * 30)

    # 1. Basic Analysis
    # Revenue per row is price * quantity_sold; sum it per category and list
    # the highest-earning category first.
    print("1. Revenue by Category:")
    revenue_by_category = sales_df.groupBy("category") \
        .agg(spark_sum(col("price") * col("quantity_sold")).alias("total_revenue")) \
        .orderBy("total_revenue", ascending=False)
    revenue_by_category.show()

    # Total units sold per product name, combining rows that share a name.
    print("\n2. Top Selling Products:")
    top_products = sales_df.groupBy("product_name") \
        .agg(spark_sum("quantity_sold").alias("total_quantity")) \
        .orderBy("total_quantity", ascending=False)
    top_products.show()

    # Convert to Pandas for advanced analysis
    # toPandas() collects the whole DataFrame to the driver, which is fine
    # for this small in-memory sample.
    pandas_df = sales_df.toPandas()
    pandas_df['revenue'] = pandas_df['price'] * pandas_df['quantity_sold']

    # 3. Advanced Statistical Analysis
    print("\n3. Statistical Analysis:")
    prices = pandas_df['price'].values
    quantities = pandas_df['quantity_sold'].values
    revenues = pandas_df['revenue'].values

    # NOTE: np.std defaults to ddof=0, i.e. the population standard deviation.
    print(f"Price Stats: Mean=${np.mean(prices):.2f}, Median=${np.median(prices):.2f}, Std=${np.std(prices):.2f}")
    print(f"Revenue Stats: Total=${np.sum(revenues):.2f}, Mean=${np.mean(revenues):.2f}")

    # Correlation analysis
    # DataFrame.corr() computes pairwise Pearson correlation by default.
    correlation_matrix = pandas_df[['price', 'quantity_sold', 'revenue']].corr()
    print(f"Correlations: Price-Qty={correlation_matrix.loc['price', 'quantity_sold']:.3f}, Price-Revenue={correlation_matrix.loc['price', 'revenue']:.3f}")

    # Business insights
    # idxmax() returns the group label with the largest summed revenue.
    best_category = pandas_df.groupby('category')['revenue'].sum().idxmax()
    best_product = pandas_df.groupby('product_name')['revenue'].sum().idxmax()
    print(f"Best performers: Category={best_category}, Product={best_product}")

    # Outlier detection
    # A row is an outlier when its price lies more than 1.5 * IQR below the
    # first quartile or above the third quartile. The count is per sales
    # row, so a product listed in two rows is counted twice.
    Q1, Q3 = np.percentile(prices, [25, 75])
    IQR = Q3 - Q1
    outliers = pandas_df[(prices < Q1 - 1.5*IQR) | (prices > Q3 + 1.5*IQR)]
    print(f"Price outliers: {len(outliers)} products")

    print("\nPySpark SalesData completed successfully!")
    print("=" * 50)
finally:
    # Stop Spark session
    spark.stop()