#!/usr/bin/env python3

import boto3
import argparse

# ==========================================================================
# SCRIPT: s3_migrate_data.py
# DESCRIPTION: Migrates data (objects) from a specified source S3 bucket
#              (or a specific prefix within it) to a destination S3 bucket.
#              This script is useful for tasks like copying production data
#              to a development environment, or moving data between different
#              S3 buckets for organizational or cost optimization purposes.
#
# USE CASE SCENARIO:
# A data engineering team needs to regularly copy a subset of production data
# (e.g., logs stored under a date-based key prefix such as `logs/2023/`) from a
# production S3 bucket to a development S3 bucket for testing new analytics
# pipelines. This script automates the object-by-object copying process.
#
# PREREQUISITES:
# 1.  **AWS Credentials:** The AWS CLI or SDK must be configured with credentials
#     that have the necessary permissions. This can be done via `~/.aws/credentials`,
#     environment variables, or an instance profile/IAM role for EC2 instances.
# 2.  **boto3 Library:** The AWS SDK for Python (`boto3`) must be installed.
#     You can install it using `pip install boto3`.
# 3.  **IAM Permissions:** The principal executing this script must have:
#     - `s3:ListBucket` on the source bucket.
#     - `s3:GetObject` on objects in the source bucket.
#     - `s3:PutObject` on objects in the destination bucket.
# 4.  **Existing Resources:**
#     - A source S3 bucket with data.
#     - A destination S3 bucket.
#
# HOW TO USE:
# 1.  **Save the script:** Save this content as `s3_migrate_data.py`.
# 2.  **Make it executable (Linux/macOS):** `chmod +x s3_migrate_data.py`
# 3.  **Run from your terminal:**
#     python s3_migrate_data.py \
#       --source-bucket "my-prod-data-bucket" \
#       --destination-bucket "my-dev-data-bucket" \
#       --source-prefix "logs/2023/" \
#       --destination-prefix "raw_logs/" \
#       --region "us-east-1"
#
#     **Arguments:**
#     - `--source-bucket`: The name of the S3 bucket to copy data FROM.
#     - `--destination-bucket`: The name of the S3 bucket to copy data TO.
#     - `--source-prefix`: (Optional) A prefix to filter objects in the source bucket.
#                          Only objects starting with this prefix will be copied.
#     - `--destination-prefix`: (Optional) A prefix to add to the objects in the destination bucket.
#                               This effectively places copied objects into a "folder" in the destination.
#     - `--region`: (Optional) The AWS region. Defaults to `us-east-1`.
#
# IMPORTANT CONSIDERATIONS:
# - This script performs object-by-object copying. For very large datasets or frequent syncs,
#   consider using `aws s3 sync` via CLI or AWS DataSync for more robust solutions.
# - Ensure the destination bucket has appropriate lifecycle policies if data retention differs.
# - This script does NOT delete objects from the source bucket.
# ==========================================================================

def migrate_s3_data(
    source_bucket: str,
    destination_bucket: str,
    source_prefix: str = '',
    destination_prefix: str = '',
    region_name: str = 'us-east-1'
) -> None:
    """
    Copy every object under a source S3 prefix to a destination bucket/prefix.

    Objects are listed page by page from the source bucket (filtered by
    ``source_prefix``) and copied server-side one at a time. For each object the
    ``source_prefix`` is stripped from its key and ``destination_prefix`` is
    prepended to form the destination key. Nothing is deleted from the source.

    Progress and errors are reported with ``print``; any exception raised by the
    S3 calls is caught, reported and ends the migration early.

    Args:
        source_bucket (str): Name of the bucket to copy objects from.
        destination_bucket (str): Name of the bucket to copy objects to.
        source_prefix (str): Optional. Only keys starting with this prefix are
            copied. Defaults to an empty string, meaning the whole bucket.
        destination_prefix (str): Optional. Prefix prepended to each copied key
            in the destination bucket. Defaults to an empty string.
        region_name (str): AWS region used to create the S3 client.
            Defaults to 'us-east-1'.

    Returns:
        None. The outcome is reported on standard output only.
    """
    print(f"Starting S3 data migration from s3://{source_bucket}/{source_prefix} to s3://{destination_bucket}/{destination_prefix} in region {region_name}...\n")

    # Validate the arguments before creating a client so that an invalid request
    # is rejected without touching AWS at all.
    if source_bucket == destination_bucket and source_prefix == destination_prefix:
        print("Error: Source and destination paths are identical. Aborting migration to prevent self-copying. Exiting.")
        return

    # When copying inside one bucket into a prefix nested under the source
    # prefix, every copy lands inside the key range that is still being listed.
    # Those copies would be listed and copied again (and again), so keys that
    # already live under the destination prefix are skipped. The identical-prefix
    # case was rejected above, so destination_prefix is non-empty here.
    skip_destination_keys = (
        source_bucket == destination_bucket
        and destination_prefix.startswith(source_prefix)
    )
    if skip_destination_keys:
        print(f"Note: destination prefix lies inside the source prefix; objects already under s3://{destination_bucket}/{destination_prefix} will be skipped.\n")

    # A single CopyObject request is limited to 5 GB; larger objects need a
    # multipart copy, which the managed `copy` transfer performs.
    max_single_copy_bytes = 5 * 1024 ** 3

    # Initialize the boto3 S3 client for programmatic access to AWS S3 service.
    s3_client = boto3.client('s3', region_name=region_name)

    object_count = 0
    try:
        # The paginator follows continuation tokens so buckets with more objects
        # than fit in one list_objects_v2 response are handled.
        paginator = s3_client.get_paginator('list_objects_v2')
        pages = paginator.paginate(Bucket=source_bucket, Prefix=source_prefix)

        for page in pages:
            # Pages without a 'Contents' key hold no objects.
            for obj in page.get('Contents', []):
                source_key = obj['Key']

                if skip_destination_keys and source_key.startswith(destination_prefix):
                    continue

                # Strip the source prefix to get the relative path, then
                # prepend the destination prefix.
                if source_prefix and source_key.startswith(source_prefix):
                    relative_key = source_key[len(source_prefix):]
                else:
                    relative_key = source_key

                destination_key = f"{destination_prefix}{relative_key}"

                # A "folder marker" object whose key equals the source prefix
                # maps to an empty key when no destination prefix is given; S3
                # rejects empty keys, so skip it instead of aborting the run.
                if not destination_key:
                    print(f"   Skipping s3://{source_bucket}/{source_key}: it maps to an empty destination key.")
                    continue

                print(f"   Copying s3://{source_bucket}/{source_key} to s3://{destination_bucket}/{destination_key}")

                copy_source = {'Bucket': source_bucket, 'Key': source_key}
                if obj.get('Size', 0) > max_single_copy_bytes:
                    # NOTE(review): a multipart copy may not carry over all
                    # source object metadata - confirm if that matters here.
                    s3_client.copy(
                        CopySource=copy_source,
                        Bucket=destination_bucket,
                        Key=destination_key
                    )
                else:
                    s3_client.copy_object(
                        CopySource=copy_source,
                        Bucket=destination_bucket,
                        Key=destination_key
                    )
                object_count += 1
        print(f"\nSuccessfully migrated {object_count} objects.")

    except Exception as e:
        # Top-level boundary for the script: report the failure and how far the
        # migration got, then stop.
        print(f"Error during S3 data migration: {e}")
        print(f"{object_count} objects were copied before the failure.")
        return # Exit the function on error.

    print("\n=== S3 data migration completed. ===")

# ==========================================================================
# Main execution block to parse command-line arguments and call the function.
# This allows the script to be run from the command line with various options.
# ==========================================================================
if __name__ == "__main__":
    # Text shown at the top of the `--help` output.
    cli_description = """
        Automates the migration of S3 objects from a source bucket to a destination bucket.
        Supports filtering by source prefix and adding a destination prefix.
        """

    # One entry per command-line option: (flag, keyword arguments passed to
    # add_argument). The order here is the order shown in `--help`.
    option_table = (
        (
            "--source-bucket",
            dict(required=True, help="Name of the source S3 bucket (e.g., 'my-prod-data')."),
        ),
        (
            "--destination-bucket",
            dict(required=True, help="Name of the destination S3 bucket (e.g., 'my-dev-data')."),
        ),
        (
            "--source-prefix",
            dict(default="", help="Optional. Prefix to filter objects in the source bucket (e.g., 'logs/2023/')."),
        ),
        (
            "--destination-prefix",
            dict(default="", help="Optional. Prefix to add to objects in the destination bucket (e.g., 'raw_logs/')."),
        ),
        (
            "--region",
            dict(default="us-east-1", help="AWS region where the S3 buckets are located (default: us-east-1)."),
        ),
    )

    cli = argparse.ArgumentParser(description=cli_description)
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    # Read the options supplied on the command line.
    options = cli.parse_args()

    # Hand the parsed values to the migration routine by keyword.
    migration_kwargs = {
        "source_bucket": options.source_bucket,
        "destination_bucket": options.destination_bucket,
        "source_prefix": options.source_prefix,
        "destination_prefix": options.destination_prefix,
        "region_name": options.region,
    }
    migrate_s3_data(**migration_kwargs)
