# Project 3: Sentiment Analysis with NLTK

"""
Description:
This project introduces basic Natural Language Processing (NLP) concepts by performing
sentiment analysis on text data using Python's NLTK (Natural Language Toolkit) library.
We will use a pre-trained sentiment analyzer (VADER) to classify text as positive, negative, or neutral.
This project is great for understanding text data processing and basic NLP applications.

Use Case:
Analyzing customer reviews, social media posts, or product feedback to gauge public opinion.

Concepts Covered:
- Natural Language Processing (NLP) fundamentals
- Text preprocessing (tokenization, lowercasing; performed internally by VADER rather than by this script)
- Sentiment analysis using NLTK's VADER (Valence Aware Dictionary and sEntiment Reasoner)
- Interpreting sentiment scores
"""

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

# Download NLTK resources (only needs to be done once).
# nltk.data.find() signals a missing resource by raising LookupError, so that
# is the exception to catch before falling back to a download.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    print("Downloading NLTK VADER lexicon...")
    nltk.download('vader_lexicon')
    print("Download complete.")

# 1. Prepare Sample Text Data
# We'll use a list of sentences to analyze their sentiment.
print("\nSample Text Data for Sentiment Analysis:")
texts = [
    "This product is absolutely amazing! I love it.",
    "The service was terrible and I am very disappointed.",
    "It's an okay movie, nothing special.",
    "I had a fantastic experience, highly recommend!",
    "Never buying from them again, complete waste of money.",
    "The weather is neither good nor bad today.",
]

# Echo every sample as a bullet point; no index is needed, so iterate the
# list directly instead of going through enumerate().
for text in texts:
    print(f"- {text}")

# 2. Initialize the VADER Sentiment Analyzer
# VADER is a lexicon and rule-based sentiment analysis tool that is specifically attuned
# to sentiments expressed in social media.
print("\nInitializing VADER Sentiment Analyzer...")
sid = SentimentIntensityAnalyzer()


def _label_from_compound(compound):
    """Return "Positive", "Negative", or "Neutral" for a compound score.

    Scores of 0.05 or higher are labelled positive, scores of -0.05 or lower
    are labelled negative, and everything in between is labelled neutral.
    """
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"


# 3. Perform Sentiment Analysis
# polarity_scores() yields a dictionary with 'neg', 'neu', 'pos', and
# 'compound' entries. The compound score is a normalized, weighted composite
# ranging from -1 (most extreme negative) to +1 (most extreme positive).
print("\nPerforming Sentiment Analysis:")
results = []
for text in texts:
    polarity = sid.polarity_scores(text)

    # One row per sentence; key order here fixes the DataFrame column order.
    row = {
        'Text': text,
        'Negative': polarity['neg'],
        'Neutral': polarity['neu'],
        'Positive': polarity['pos'],
        'Compound': polarity['compound'],
        'Sentiment': _label_from_compound(polarity['compound']),
    }
    results.append(row)

# 4. Display Results in a DataFrame for better readability
# to_string() renders the whole table as text for printing.
results_df = pd.DataFrame(results)
print("\nSentiment Analysis Results:")
print(results_df.to_string())

# 5. Analyze a custom sentence
# Score one extra sentence and label it using the same +/-0.05 compound-score
# cutoffs that are applied to the sample texts.
custom_sentence = "This is a decent attempt, but there are areas for improvement."
custom_scores = sid.polarity_scores(custom_sentence)
if custom_scores['compound'] >= 0.05:
    custom_sentiment = "Positive"
elif custom_scores['compound'] <= -0.05:
    custom_sentiment = "Negative"
else:
    custom_sentiment = "Neutral"

# Report the sentence, its raw score dictionary, and the derived label.
# (The first print() call previously lacked its closing parenthesis, which
# made the whole file fail to parse.)
print(f"\nAnalysis of custom sentence: \"{custom_sentence}\"")
print(f"Scores: {custom_scores}")
print(f"Overall Sentiment: {custom_sentiment}")

# Further Exploration (Optional):
# You can try to implement your own simple lexicon-based sentiment analyzer
# by creating lists of positive and negative words and counting their occurrences.
# Or, explore other NLTK functionalities like tokenization, stemming, or lemmatization.