
#!/usr/bin/env python3

import boto3
import json
import argparse
import requests
from requests_aws4auth import AWS4Auth

# ==========================================================================
# SCRIPT: opensearch_ingest_json.py
# DESCRIPTION: Ingests data from a JSON file into an Amazon OpenSearch Service domain.
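#              This script supports both single JSON objects and lists of JSON objects;
#              each document is indexed with its own HTTP request (the Bulk API is not used).
#              It handles AWS SigV4 authentication for secure communication with the
#              OpenSearch domain, or basic authentication when master credentials are supplied.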
#
# USE CASE SCENARIO:
# A data analytics team has collected application logs or sensor data in JSON format
# and needs to load this data into an OpenSearch domain for real-time search and
# analysis. This script provides a programmatic way to automate that ingestion.
#
# PREREQUISITES:
# 1.  **AWS Credentials:** The AWS CLI or SDK must be configured with credentials
#     that have the necessary permissions. This can be done via `~/.aws/credentials`,
#     environment variables, or an instance profile/IAM role for EC2 instances.
# 2.  **Python Libraries:** The following Python libraries must be installed:
#     - `boto3`: AWS SDK for Python (`pip install boto3`)
#     - `requests`: HTTP library (`pip install requests`)
#     - `requests_aws4auth`: For AWS SigV4 authentication (`pip install requests_aws4auth`)
# 3.  **IAM Permissions:** The principal executing this script must have:
#     - `es:ESHttpPut` on the OpenSearch domain.
#     - `es:ESHttpPost` on the OpenSearch domain.
#     - `es:ESHttpHead` on the OpenSearch domain.
# 4.  **Existing Resources:**
#     - An Amazon OpenSearch Service domain that is accessible from where the script is run.
#     - A JSON file containing the data to be ingested.
#
# HOW TO USE:
# 1.  **Save the script:** Save this content as `opensearch_ingest_json.py`.
# 2.  **Make it executable (Linux/macOS):** `chmod +x opensearch_ingest_json.py`
# 3.  **Prepare your JSON data:** Create a JSON file (e.g., `data.json`).
#     - For a single document:
#       `{"id": "1", "title": "My First Document", "content": "This is some content."}`
#     - For multiple documents (a list of JSON objects):
#       `[{"id": "2", "title": "Second Doc", "content": "More content here."}, {"id": "3", "title": "Third Doc", "content": "Even more content."}]`
# 4.  **Run from your terminal:**
#     python opensearch_ingest_json.py \
#       --opensearch-endpoint "https://search-your-domain-id.us-east-1.es.amazonaws.com" \
#       --index-name "my-app-data" \
#       --json-file-path "./data.json" \
#       --region "us-east-1" \
#       --master-user "masteruser" \
#       --master-password "MasterPassword123!" # Only if basic auth is enabled
#
#     **Arguments:**
#     - `--opensearch-endpoint`: The full HTTPS endpoint of your OpenSearch domain.
#     - `--index-name`: The name of the OpenSearch index to ingest data into.
#     - `--json-file-path`: The path to the local JSON file containing the data.
#     - `--region`: (Optional) The AWS region of the OpenSearch domain. Defaults to `us-east-1`.
#     - `--master-user`: (Optional) Master username for basic authentication (if enabled on your domain).
#     - `--master-password`: (Optional) Master password for basic authentication (if enabled on your domain).
#
# IMPORTANT CONSIDERATIONS:
# - For production environments, consider using a dedicated ingestion pipeline (e.g., Kinesis Data Firehose,
#   Logstash, or a custom Lambda function) for continuous data streams.
# - This script performs basic indexing. For complex data transformations or large-scale bulk indexing,
#   consider using OpenSearch's Bulk API or AWS Glue.
# - Ensure your OpenSearch domain's access policy allows the IAM principal running this script to index data.
# ==========================================================================

def _resolve_document_id(document, fallback_id):
    """Return the ID under which *document* should be indexed.

    The document's own ``id`` field is used when it holds a usable value.
    ``None`` and the empty string are treated as missing because they cannot
    form a valid ``/_doc/<id>`` path; in that case *fallback_id* is returned.
    """
    raw_id = document.get('id')
    if raw_id is None or raw_id == '':
        return fallback_id
    return str(raw_id)


def _build_document_url(opensearch_endpoint, index_name, doc_id):
    """Return the ``<endpoint>/<index>/_doc/<id>`` URL for a single document."""
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    # A trailing slash on the endpoint would otherwise yield '//index/...'.
    base_url = opensearch_endpoint.rstrip('/')
    # Percent-encode the ID so characters such as '/', '?', '#' or spaces stay
    # inside the ID path segment instead of changing the request target.
    return f"{base_url}/{index_name}/_doc/{quote(str(doc_id), safe='')}"


def ingest_json_to_opensearch(
    opensearch_endpoint: str,
    index_name: str,
    json_file_path: str,
    region_name: str = 'us-east-1',
    master_user: str = None,
    master_password: str = None,
    request_timeout: float = 30.0
):
    """
    Ingests data from a JSON file into an Amazon OpenSearch Service domain.

    The file may contain either a single JSON object or a list of JSON objects.
    Each object is indexed with its own ``PUT <endpoint>/<index>/_doc/<id>``
    request. The document ID is taken from the object's ``id`` field; when that
    field is absent, ``null`` or empty, the 1-based position in the list is used
    (``'1'`` for a single object). List items that are not JSON objects are
    reported and skipped without aborting the remaining documents.

    Progress and errors are reported with ``print``; the function does not raise
    for missing files, invalid JSON, missing credentials or request failures.

    Args:
        opensearch_endpoint (str): The full HTTPS endpoint of your OpenSearch domain.
                                   Example: 'https://search-your-domain-id.us-east-1.es.amazonaws.com'.
                                   A trailing slash is tolerated.
        index_name (str): The name of the OpenSearch index to ingest data into.
        json_file_path (str): The absolute or relative path to the UTF-8 encoded JSON file.
        region_name (str): The AWS region used for SigV4 signing. Defaults to 'us-east-1'.
        master_user (str): Optional. Username for basic authentication. If provided together with
                           `master_password`, basic auth is used instead of SigV4.
        master_password (str): Optional. Password for basic authentication. Must be provided
                               whenever `master_user` is provided (and vice versa).
        request_timeout (float): Seconds to wait for each HTTP request before giving up.
                                 Passed to `requests` as the `timeout` argument. Defaults to 30.

    Returns:
        None
    """
    print(f"Starting data ingestion to OpenSearch index '{index_name}' at '{opensearch_endpoint}' in region {region_name}...\n")

    # ==========================================================================
    # STEP 1: Set up authentication for the OpenSearch request.
    # We prioritize basic authentication if credentials are provided, otherwise use AWS SigV4.
    # ==========================================================================
    service = 'es'  # The service name for OpenSearch in AWS SigV4 authentication.
    headers = {"Content-Type": "application/json"}

    # Exactly one of user/password is a caller mistake; silently switching to
    # SigV4 would hide it, so refuse to continue.
    if bool(master_user) != bool(master_password):
        print("Error: master_user and master_password must be provided together. Exiting.")
        return

    if master_user and master_password:
        print("   Using basic authentication for OpenSearch.")
        auth = (master_user, master_password)
    else:
        print("   Using AWS SigV4 authentication for OpenSearch.")
        credentials = boto3.Session().get_credentials()
        # get_credentials() returns None when no credential source is configured.
        if credentials is None:
            print("Error: No AWS credentials found. Configure credentials or supply master_user/master_password. Exiting.")
            return
        auth = AWS4Auth(credentials.access_key, credentials.secret_key, region_name, service, session_token=credentials.token)

    # ==========================================================================
    # STEP 2: Read and parse the JSON data from the specified file.
    # ==========================================================================
    print(f">>> Step 2: Reading data from JSON file: '{json_file_path}'...")
    try:
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data_to_ingest = json.load(f)
        print(f"   Successfully loaded data from {json_file_path}.")
    except FileNotFoundError:
        print(f"Error: JSON file not found at {json_file_path}. Please check the path. Exiting.")
        return
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {json_file_path}. Please ensure it's valid JSON. Exiting.")
        return
    except Exception as e:
        print(f"An unexpected error occurred while reading the JSON file: {e}. Exiting.")
        return

    # ==========================================================================
    # STEP 3: Ingest the data into the OpenSearch domain.
    # A single object is handled as a one-element batch so both shapes share
    # the same indexing loop; only the log wording differs.
    # ==========================================================================
    print(f"\n>>> Step 3: Ingesting data into OpenSearch index '{index_name}'...")
    if isinstance(data_to_ingest, list):
        documents = data_to_ingest
        indent, label = "      ", "document"
        print(f"   Found {len(documents)} documents to index.")
    elif isinstance(data_to_ingest, dict):
        documents = [data_to_ingest]
        indent, label = "   ", "single document"
    else:
        print("Error: JSON file must contain a single JSON object or a list of JSON objects. Exiting.")
        return

    succeeded = 0
    failed = 0
    try:
        for position, doc in enumerate(documents, start=1):
            # Only JSON objects can be indexed; skip anything else instead of
            # letting one bad item abort the rest of the batch.
            if not isinstance(doc, dict):
                print(f"{indent}Error: item {position} is not a JSON object (found {type(doc).__name__}); skipping.")
                failed += 1
                continue

            doc_id = _resolve_document_id(doc, str(position))
            url = _build_document_url(opensearch_endpoint, index_name, doc_id)

            # The timeout prevents a stalled connection from hanging the script.
            r = requests.put(url, auth=auth, headers=headers, data=json.dumps(doc), timeout=request_timeout)

            if r.status_code not in (200, 201):
                print(f"{indent}Error indexing {label} {doc_id}: {r.status_code} - {r.text}")
                failed += 1
            else:
                print(f"{indent}Indexed {label} {doc_id} successfully.")
                succeeded += 1
    except requests.exceptions.RequestException as e:
        # Network-level failures (DNS, TLS, timeout, ...) end the run; the
        # remaining documents are not attempted.
        print(f"Network or request error during ingestion: {e}")
    except Exception as e:
        # Last-resort guard so the script reports the problem instead of crashing.
        print(f"An unexpected error occurred during ingestion: {e}")
    else:
        # Report the real outcome rather than an unconditional success message.
        print(f"\nData ingestion to OpenSearch completed: {succeeded} succeeded, {failed} failed.")

# ==========================================================================
# Main execution block to parse command-line arguments and call the function.
# This allows the script to be run from the command line with various options.
# ==========================================================================
if __name__ == "__main__":
    # Build the command-line interface for the script.
    parser = argparse.ArgumentParser(
        description="""
        Ingests data from a JSON file into an Amazon OpenSearch Service domain.
        Supports single JSON objects or lists of objects for indexing.
        """
    )
    # Required arguments: where to send the data, which index, and which file.
    parser.add_argument(
        "--opensearch-endpoint",
        required=True,
        help="The endpoint of your OpenSearch domain (e.g., 'https://search-your-domain-id.us-east-1.es.amazonaws.com')."
    )
    parser.add_argument(
        "--index-name",
        required=True,
        help="The name of the OpenSearch index to ingest data into."
    )
    parser.add_argument(
        "--json-file-path",
        required=True,
        help="The path to the JSON file containing the data."
    )
    # Optional arguments: signing region and basic-auth credentials.
    parser.add_argument(
        "--region",
        default="us-east-1",
        help="AWS region of the OpenSearch domain (default: us-east-1)."
    )
    parser.add_argument(
        "--master-user",
        help="Optional. Master username for basic authentication (if enabled on your domain)."
    )
    parser.add_argument(
        "--master-password",
        help="Optional. Master password for basic authentication (if enabled on your domain)."
    )

    # Parse the arguments provided by the user when running the script.
    args = parser.parse_args()

    # Basic authentication needs both halves. Rejecting a lone user or password
    # here avoids silently ignoring it; parser.error prints usage and exits
    # with status 2.
    if bool(args.master_user) != bool(args.master_password):
        parser.error("--master-user and --master-password must be provided together.")

    # Hand the parsed values to the ingestion routine.
    ingest_json_to_opensearch(
        opensearch_endpoint=args.opensearch_endpoint,
        index_name=args.index_name,
        json_file_path=args.json_file_path,
        region_name=args.region,
        master_user=args.master_user,
        master_password=args.master_password
    )
