#!/usr/bin/env python3

import json
import argparse
import psycopg2 # Example for PostgreSQL. Replace with appropriate driver for other DBs.
import os

# ==========================================================================
# SCRIPT: rds_ingest_json.py
# DESCRIPTION: Ingests data from a JSON file into an Amazon RDS PostgreSQL
#              database table. This script assumes the JSON file contains
#              a list of objects, where each object's keys map directly to
#              the target table's column names.
#
# USE CASE SCENARIO:
# A data pipeline generates daily reports or user activity data in JSON format.
# This data needs to be loaded into a relational database (RDS PostgreSQL)
# for further analysis or to be consumed by an application. This script automates
# the process of reading the JSON and inserting it into the database.
#
# PREREQUISITES:
# 1.  **Python Libraries:** The following Python libraries must be installed:
#     - `psycopg2-binary`: PostgreSQL adapter for Python (`pip install psycopg2-binary`)
#       (For MySQL, use `pymysql`; for SQL Server, use `pyodbc` or `pymssql`)
# 2.  **IAM Permissions:** The principal executing this script needs network access
#     to the RDS instance. Ensure the security group of the RDS instance allows
#     inbound connections from where this script is run.
# 3.  **Existing Resources:**
#     - An Amazon RDS PostgreSQL instance.
#     - A database and a table within that instance, with column names matching
#       the keys in your JSON objects.
#     - A local JSON file containing the data to be ingested.
#
# HOW TO USE:
# 1.  **Save the script:** Save this content as `rds_ingest_json.py`.
# 2.  **Make it executable (Linux/macOS):** `chmod +x rds_ingest_json.py`
# 3.  **Prepare your JSON data:** Create a JSON file (e.g., `users.json`)
#     containing a list of objects. Example:
#     `[{"id": 1, "name": "Alice", "email": "alice@example.com"}, {"id": 2, "name": "Bob", "email": "bob@example.com"}]`
# 4.  **Run from your terminal:**
#     python rds_ingest_json.py \
#       --db-host "my-rds-instance.abcdefghijk.us-east-1.rds.amazonaws.com" \
#       --db-port 5432 \
#       --db-name "mydb" \
#       --db-user "dbadmin" \
#       --db-password "MySecurePassword123!" \
#       --table-name "users" \
#       --json-file-path "./users.json"
#
#     **Arguments:**
#     - `--db-host`: The hostname or IP address of the RDS instance.
#     - `--db-port`: The port number of the database (default: 5432 for PostgreSQL).
#     - `--db-name`: The name of the database to connect to.
#     - `--db-user`: The username to connect to the database.
#     - `--db-password`: The password for the database user.
#     - `--table-name`: The name of the target table in the database where data will be inserted.
#     - `--json-file-path`: The path to the local JSON file containing the data.
#
# IMPORTANT CONSIDERATIONS:
# - This script assumes the target table already exists and its schema matches the JSON keys.
# - For large datasets, consider using `COPY` command (PostgreSQL) or `LOAD DATA INFILE` (MySQL)
#   for better performance, potentially staging data in S3 first.
# - Error handling for individual record insertion is basic (rollback per item). For production,
#   consider batching inserts and more robust error logging.
# - Ensure the database user has `INSERT` permissions on the target table.
# ==========================================================================

def ingest_json_to_rds(
    db_host: str,
    db_port: int,
    db_name: str,
    db_user: str,
    db_password: str,
    table_name: str,
    json_file_path: str
):
    """
    Ingests data from a JSON file into an RDS PostgreSQL database table.

    This function reads a JSON file expected to contain a list of dictionaries.
    Each dictionary is treated as a row, and its keys are mapped to table column names.
    It connects to the specified RDS PostgreSQL database and inserts the data.

    Args:
        db_host (str): The hostname or IP address of the RDS instance.
        db_port (int): The port number of the database (e.g., 5432 for PostgreSQL).
        db_name (str): The name of the database to connect to.
        db_user (str): The username for database authentication.
        db_password (str): The password for the database user.
        table_name (str): The name of the target table in the database where data will be inserted.
        json_file_path (str): The absolute or relative path to the JSON file containing the data.
    """
    print(f"Starting data ingestion from '{json_file_path}' to RDS table '{table_name}'...")

    # ==========================================================================
    # STEP 1: Read and parse the JSON data from the specified file.
    # The script expects the JSON file to contain a list of objects.
    # ==========================================================================
    print(f"\n>>> Step 1: Reading data from JSON file: '{json_file_path}'...")
    data_to_ingest = None
    try:
        # Check if the file exists before attempting to open it.
        if not os.path.exists(json_file_path):
            print(f"Error: Local JSON file not found at {json_file_path}. Please check the path. Exiting.")
            return

        with open(json_file_path, 'r') as f:
            data_to_ingest = json.load(f)
        print(f"   Successfully loaded data from {json_file_path}.")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {json_file_path}. Please ensure it's valid JSON. Exiting.")
        return
    except Exception as e:
        print(f"An unexpected error occurred while reading the JSON file: {e}. Exiting.")
        return

    # Validate that the loaded data is a list and not empty.
    if not isinstance(data_to_ingest, list):
        print("Error: JSON file should contain a list of objects for ingestion. Exiting.")
        return

    if not data_to_ingest:
        print("No data found in JSON file to ingest. Exiting.")
        return

    # ==========================================================================
    # STEP 2: Connect to the RDS PostgreSQL database.
    # This uses the `psycopg2` library to establish a connection.
    # ==========================================================================
    print(f"\n>>> Step 2: Connecting to RDS database '{db_name}' at {db_host}:{db_port}...")
    conn = None
    try:
        conn = psycopg2.connect(
            host=db_host,
            port=db_port,
            database=db_name,
            user=db_user,
            password=db_password
        )
        cur = conn.cursor() # Create a cursor object to execute SQL commands.
        print("   Successfully connected to RDS database.")

        # ==========================================================================
        # STEP 3: Prepare the SQL INSERT statement and ingest data.
        # It dynamically constructs the INSERT statement based on JSON keys.
        # ==========================================================================
        print(f"\n>>> Step 3: Ingesting data into table '{table_name}'...")
        # Get column names from the keys of the first JSON object.
        columns = list(data_to_ingest[0].keys())
        # Format column names for SQL (e.g., "column_name").
        columns_str = ', '.join([f'\"{\"}\"'.format(col) for col in columns]) 
        # Create placeholders for the values in the SQL query (e.g., %s, %s).
        placeholders = ', '.join(['%s'] * len(columns))

        # Construct the full INSERT SQL statement.
        insert_sql = f"INSERT INTO \"{table_name}\" ({columns_str}) VALUES ({placeholders})"

        ingested_count = 0
        # Iterate through each item (row) in the JSON data.
        for item in data_to_ingest:
            # Extract values from the JSON object in the order of columns.
            values = [item.get(col) for col in columns]
            try:
                # Execute the INSERT statement for each item.
                cur.execute(insert_sql, values)
                ingested_count += 1
            except Exception as e:
                # If an error occurs for a specific item, print it and rollback the transaction for that item.
                # For production, consider more robust error logging and potentially skipping problematic rows.
                print(f"      Error inserting item: {item}. Error: {e}")
                conn.rollback() # Rollback the current transaction to prevent partial inserts.

        conn.commit() # Commit all successful insertions to the database.
        print(f"   Successfully ingested {ingested_count} records into table '{table_name}'.")

    except psycopg2.Error as e:
        # Catch specific PostgreSQL errors.
        print(f"Database connection or query error: {e}")
    except Exception as e:
        # Catch any other unexpected errors during database operations.
        print(f"An unexpected error occurred during database operations: {e}")
    finally:
        # Ensure the database connection is closed, regardless of success or failure.
        if conn:
            cur.close()
            conn.close()
            print("   Database connection closed.")

    print("\n=== RDS data ingestion completed. ===")

# ==========================================================================
# Main execution block to parse command-line arguments and call the function.
# This allows the script to be run from the command line with various options.
# ==========================================================================
if __name__ == "__main__":
    # Create an argument parser object.
    parser = argparse.ArgumentParser(
        description="""
        Ingests data from a JSON file into an Amazon RDS PostgreSQL database table.
        Assumes the JSON file contains a list of objects, where each object's keys
        map directly to table column names.
        """
    )
    # Define the command-line arguments the script expects.
    parser.add_argument(
        "--db-host", 
        required=True, 
        help="The hostname or IP address of the RDS instance (e.g., 'my-rds-instance.abcdefghijk.us-east-1.rds.amazonaws.com')."
    )
    parser.add_argument(
        "--db-port", 
        type=int, 
        default=5432, 
        help="The port number of the database (default: 5432 for PostgreSQL)."
    )
    parser.add_argument(
        "--db-name", 
        required=True, 
        help="The name of the database to connect to (e.g., 'mydb')."
    )
    parser.add_argument(
        "--db-user", 
        required=True, 
        help="The username for database authentication (e.g., 'dbadmin')."
    )
    parser.add_argument(
        "--db-password", 
        required=True, 
        help="The password for the database user."
    )
    parser.add_argument(
        "--table-name", 
        required=True, 
        help="The name of the target table in the database where data will be inserted (e.g., 'users')."
    )
    parser.add_argument(
        "--json-file-path", 
        required=True, 
        help="The path to the local JSON file containing the data (e.g., './data/users.json')."
    )

    # Parse the arguments provided by the user when running the script.
    args = parser.parse_args()

    # Call the main function with the parsed arguments.
    ingest_json_to_rds(
        db_host=args.db_host,
        db_port=args.db_port,
        db_name=args.db_name,
        db_user=args.db_user,
        db_password=args.db_password,
        table_name=args.table_name,
        json_file_path=args.json_file_path
    )
