# EMISSION PREDICTION MODEL


#INDIA SPECIFIC MODEL
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from prophet import Prophet
import os

# -------------------------------
# 1. Data Generation & Preprocessing (Indian Standards)
# -------------------------------

def generate_data():
    """
    Generate synthetic SO₂ emission data based on Indian standards and save it to CSV.
    """
    np.random.seed(42)

    # Date range (monthly from 2000 to 2023)
    dates = pd.date_range(start='2000-01-01', end='2023-12-31', freq='ME')

    # Fuel types based on Indian power plants
    fuel_types = ['Domestic Coal', 'Imported Coal', 'Lignite', 'Natural Gas']
    regions = ['North', 'South', 'East', 'West']

    n_samples = 1000
    fuel_choice = np.random.choice(fuel_types, n_samples)

    energy_output = []
    so2_emissions = []
    fuel_cost = []

    for fuel in fuel_choice:
        if fuel == 'Domestic Coal':
            energy_output.append(np.random.uniform(300, 1500))
            so2_emissions.append(np.random.uniform(100, 600))  # Indian SO₂ norms (mg/Nm³)
            fuel_cost.append(np.random.uniform(2000, 5000))  # INR per ton
        elif fuel == 'Imported Coal':
            energy_output.append(np.random.uniform(500, 2000))
            so2_emissions.append(np.random.uniform(50, 300))
            fuel_cost.append(np.random.uniform(3000, 7000))
        elif fuel == 'Lignite':
            energy_output.append(np.random.uniform(200, 1000))
            so2_emissions.append(np.random.uniform(400, 800))
            fuel_cost.append(np.random.uniform(1000, 3000))
        elif fuel == 'Natural Gas':
            energy_output.append(np.random.uniform(100, 800))
            so2_emissions.append(np.random.uniform(10, 100))
            fuel_cost.append(np.random.uniform(4000, 9000))

    df = pd.DataFrame({
        'date': np.random.choice(dates, n_samples),
        'fuel_type': fuel_choice,
        'energy_output': energy_output,
        'so2_emissions': so2_emissions,
        'fuel_cost': fuel_cost,
        'region': np.random.choice(regions, n_samples)
    })

    df.to_csv('so2_data_india.csv', index=False)
    print("Synthetic data generated for Indian standards and saved as 'so2_data_india.csv'.")

def preprocess_data(csv_path='so2_data_india.csv'):
    """
    Load the emission CSV and add the calendar, lag and rolling features.

    Rows are put into chronological order before the lag and rolling
    features are computed, because shift() and rolling() operate on row
    position and the CSV is not guaranteed to be ordered by date.

    Args:
        csv_path: Path of the CSV to load. It must provide the columns
            'date', 'fuel_type', 'fuel_cost', 'region' and 'so2_emissions'.

    Returns:
        DataFrame sorted by 'date' (rows sharing a date keep their file
        order) with a fresh 0..n-1 index and these added columns:
        'year', 'month' - calendar parts of 'date';
        'fuel_cost_lag1' - previous fuel cost seen for the same fuel type,
        falling back to the row's own fuel cost when there is none;
        'so2_rolling_avg' - mean of the latest 12 SO₂ values for the same
        region, falling back to the row's own SO₂ value until 12 exist.
    """
    df = pd.read_csv(csv_path)
    df['date'] = pd.to_datetime(df['date'])

    # Stable sort so "previous row" below really means "previous in time".
    df = df.sort_values('date', kind='mergesort').reset_index(drop=True)

    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['fuel_cost_lag1'] = df.groupby('fuel_type')['fuel_cost'].shift(1)
    # NOTE(review): the 12-row window includes the current row, and the
    # fallback below is the row's own so2_emissions - i.e. this feature
    # contains the value the model is trained to predict. Confirm intent.
    df['so2_rolling_avg'] = df.groupby('region')['so2_emissions'].transform(lambda x: x.rolling(12).mean())

    # The first row of each group has no predecessor / a too-short window.
    df['fuel_cost_lag1'] = df['fuel_cost_lag1'].fillna(df['fuel_cost'])
    df['so2_rolling_avg'] = df['so2_rolling_avg'].fillna(df['so2_emissions'])
    return df

# -------------------------------
# 2. Hybrid Model (Prophet + XGBoost)
# -------------------------------

class HybridModel:
    """
    Two-stage SO₂ regressor combining Prophet and XGBoost.

    Prophet is fitted on (date, so2_emissions); XGBoost is then fitted on
    the residuals (actual value minus Prophet's yhat) using the tabular
    features. A prediction is Prophet's yhat for the row's date plus the
    XGBoost residual estimate for the row's features.
    """

    def __init__(self):
        self.prophet = Prophet(seasonality_mode='multiplicative')
        self.xgb = XGBRegressor(n_estimators=300, learning_rate=0.01)
        # Fitted ColumnTransformer; stays None until fit() has run.
        self.preprocessor = None

    def _prophet_yhat(self, dates):
        """
        Return Prophet's yhat for every entry of *dates*, in the given order.

        Prophet returns its forecast sorted by `ds`, so pairing its output
        with the input rows by position is only right for date-sorted
        input. The forecast is therefore made once per distinct date and
        mapped back onto the rows by date.
        """
        ds = pd.to_datetime(dates).reset_index(drop=True)
        unique_ds = pd.DataFrame({'ds': ds.drop_duplicates().sort_values().to_numpy()})
        forecast = self.prophet.predict(unique_ds)
        yhat_by_date = pd.Series(
            forecast['yhat'].to_numpy(),
            index=pd.to_datetime(forecast['ds']).to_numpy(),
        )
        return ds.map(yhat_by_date).to_numpy()

    def fit(self, df):
        """
        Fit both stages on *df*.

        Args:
            df: Training frame with 'date', 'so2_emissions' and the feature
                columns 'year', 'month', 'fuel_cost', 'fuel_cost_lag1',
                'so2_rolling_avg', 'fuel_type' and 'region'. Other columns
                are ignored by the feature transformer.
        """
        prophet_df = df[['date', 'so2_emissions']].rename(columns={'date': 'ds', 'so2_emissions': 'y'})
        self.prophet.fit(prophet_df)

        X = df.drop(['so2_emissions', 'date'], axis=1)
        # What Prophet leaves unexplained, aligned row-by-row with df.
        y_residuals = df['so2_emissions'] - self._prophet_yhat(df['date'])

        self.preprocessor = ColumnTransformer([
            ('num', StandardScaler(), ['year', 'month', 'fuel_cost', 'fuel_cost_lag1', 'so2_rolling_avg']),
            ('cat', OneHotEncoder(handle_unknown='ignore'), ['fuel_type', 'region'])
        ])
        X_processed = self.preprocessor.fit_transform(X)

        self.xgb.fit(X_processed, y_residuals)
        print("Hybrid model trained successfully.")

    def predict(self, df):
        """
        Predict SO₂ emissions for every row of *df*.

        Args:
            df: Frame with 'date' and the feature columns used in fit().
                A 'so2_emissions' column is dropped if present.

        Returns:
            NumPy array with one prediction per row, in the row order of *df*.
        """
        X = df.drop(['date', 'so2_emissions'], axis=1, errors='ignore')
        X_processed = self.preprocessor.transform(X)
        xgb_pred = self.xgb.predict(X_processed)

        return self._prophet_yhat(df['date']) + xgb_pred

# -------------------------------
# 3. Training & Saving the Model
# -------------------------------

def train_model():
    """
    Train the hybrid model, report its hold-out error and save it to disk.

    The synthetic CSV is generated first when it is not already present.
    The first 80% of the preprocessed rows are used for fitting and the
    remaining rows for the mean-absolute-error report. The fitted model is
    written to 'so2_trained_model_india.pkl'.
    """
    data_file = 'so2_data_india.csv'
    model_file = 'so2_trained_model_india.pkl'

    if not os.path.exists(data_file):
        generate_data()

    frame = preprocess_data()

    # Positional split: leading 80% for training, trailing 20% held out.
    cutoff = int(0.8 * len(frame))
    train_part, holdout = frame.iloc[:cutoff], frame.iloc[cutoff:]

    hybrid = HybridModel()
    hybrid.fit(train_part)

    mae = mean_absolute_error(holdout['so2_emissions'], hybrid.predict(holdout))
    print(f"Model Evaluation: MAE = {mae:.2f}")

    joblib.dump(hybrid, model_file)
    print("Trained model saved as 'so2_trained_model_india.pkl'.")

# -------------------------------
# 4. User Input & Prediction
# -------------------------------
def get_user_input():
    """
    Prompt on stdin for one prediction scenario and return it as a DataFrame.

    Fuel type and region are matched against the labels offered in the
    prompts without regard to letter case or extra spaces, and returned in
    the prompt's spelling. An answer that matches no label is passed through
    unchanged after a warning is printed.

    Returns:
        One-row DataFrame with the columns 'date', 'fuel_type', 'region',
        'year', 'month', 'fuel_cost', 'fuel_cost_lag1' and 'so2_rolling_avg'.
        'date' is the last day of the entered month.

    Raises:
        ValueError: If year, month or fuel cost is not a number, or the
            month is outside 1-12.
    """
    fuel_options = ('Domestic Coal', 'Imported Coal', 'Lignite', 'Natural Gas')
    region_options = ('North', 'South', 'East', 'West')

    def canonical(raw, options):
        # Map e.g. 'natural  gas' onto 'Natural Gas'.
        cleaned = " ".join(raw.split())
        by_folded = {option.casefold(): option for option in options}
        match = by_folded.get(cleaned.casefold())
        if match is None:
            print(f"Warning: '{cleaned}' is not one of: {', '.join(options)}. Using it as entered.")
            return cleaned
        return match

    fuel_type = canonical(
        input("Enter fuel type (Domestic Coal, Imported Coal, Lignite, Natural Gas): "),
        fuel_options,
    )
    region = canonical(input("Enter region (North, South, East, West): "), region_options)
    year = int(input("Enter year: ").strip())
    month = int(input("Enter month (1-12): ").strip())
    if not 1 <= month <= 12:
        raise ValueError(f"month must be between 1 and 12, got {month}")
    fuel_cost = float(input("Enter fuel cost (INR per ton): ").strip())

    # Last calendar day of the requested month (MonthEnd(0) rolls the 1st
    # forward to the end of the same month).
    date = pd.Timestamp(year=year, month=month, day=1) + pd.offsets.MonthEnd(0)

    data = pd.DataFrame({
        'date': [date],
        'fuel_type': [fuel_type],
        'region': [region],
        'year': [year],
        'month': [month],
        'fuel_cost': [fuel_cost],
        'fuel_cost_lag1': [fuel_cost],  # No history available: reuse the current cost
        'so2_rolling_avg': [100]  # Placeholder, should ideally be based on past data
    })

    return data


def predict_from_user_input():
    """
    Load the saved model, ask the user for one scenario and print the prediction.

    Prints a hint and does nothing else when 'so2_trained_model_india.pkl'
    does not exist.
    """
    model_file = 'so2_trained_model_india.pkl'

    if os.path.exists(model_file):
        # joblib.load unpickles the file, so only load model files you trust.
        hybrid = joblib.load(model_file)
        print("Model loaded successfully!")

        prediction = hybrid.predict(get_user_input())

        print("\nPrediction Results:")
        print(f"Predicted SO₂ Emissions: {prediction[0]:.2f} mg/Nm³")
    else:
        print("No trained model found. Please train the model first.")

if __name__ == "__main__":
    # Menu answers mapped to the routine each one starts.
    actions = {"1": train_model, "2": predict_from_user_input}

    print("Select an option:")
    print("1: Train Model")
    print("2: Predict SO₂ Emission")

    selected = actions.get(input("Enter 1 or 2: ").strip())
    if selected is None:
        print("Invalid option. Exiting.")
    else:
        selected()