import json
import sys
from collections import Counter

import matplotlib.pyplot as plt
def count_reviews_by_stars_and_average(file_path):
    """Aggregate rating, text-length, word and vote statistics from a review file.

    The file is read as UTF-8 with one JSON document per line. A record is
    counted only when it is a JSON object with a numeric ``stars`` value and
    a string ``text`` value; blank lines and other records are skipped.

    Args:
        file_path: Path of the JSON-lines file to read.

    Returns:
        A 7-tuple ``(star_counts, average_rating, average_text_length,
        average_short_text_rating, most_common_word, most_common_count,
        average_votes_by_star)`` where:

        * ``star_counts`` maps each star value seen to its number of reviews.
        * ``average_rating`` / ``average_text_length`` are means over all
          counted reviews (``0`` when there are none); length is measured
          in characters.
        * ``average_short_text_rating`` is the mean rating of reviews whose
          text is shorter than 10 characters (``0`` when there are none).
        * ``most_common_word`` / ``most_common_count`` describe the most
          frequent lower-cased, alphanumeric-only word among reviews found
          on the first 100000 lines (``None`` and ``0`` when no word was
          seen). Ties go to the word encountered first.
        * ``average_votes_by_star`` maps a star value to the mean
          ``useful`` / ``funny`` / ``cool`` votes of the reviews carrying
          all three fields; star values with no such review are omitted.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    star_counts = {}
    total_stars = 0
    total_reviews = 0
    total_text_length = 0
    short_text_stars = 0
    short_text_count = 0
    word_frequencies = Counter()
    word_count_limit = 100000
    # Pre-seeded for the usual 1..5 scale so those keys stay ints and keep
    # ascending order in the result; other values are added on demand below.
    star_vote_totals = {
        stars: {'useful': 0, 'funny': 0, 'cool': 0, 'count': 0}
        for stars in range(1, 6)
    }

    with open(file_path, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            record = json.loads(line)
            if not isinstance(record, dict):
                continue
            stars = record.get("stars")
            text = record.get("text")
            if not isinstance(stars, (int, float)) or not isinstance(text, str):
                continue

            text_length = len(text)
            star_counts[stars] = star_counts.get(stars, 0) + 1
            total_stars += stars
            total_reviews += 1
            total_text_length += text_length

            if text_length < 10:
                short_text_stars += stars
                short_text_count += 1

            # The limit applies to the line index, not the number of
            # counted reviews, so skipped lines still use up the budget.
            if i < word_count_limit:
                cleaned = (
                    ''.join(char for char in token if char.isalnum())
                    for token in text.lower().split()
                )
                word_frequencies.update(word for word in cleaned if word)

            if "useful" in record and "funny" in record and "cool" in record:
                # setdefault keeps ratings outside 1..5 (e.g. 3.5) from
                # raising KeyError.
                totals = star_vote_totals.setdefault(
                    stars, {'useful': 0, 'funny': 0, 'cool': 0, 'count': 0}
                )
                totals['useful'] += record["useful"]
                totals['funny'] += record["funny"]
                totals['cool'] += record["cool"]
                totals['count'] += 1

    if total_reviews > 0:
        average_rating = total_stars / total_reviews
        average_text_length = total_text_length / total_reviews
    else:
        average_rating = 0
        average_text_length = 0

    if short_text_count > 0:
        average_short_text_rating = short_text_stars / short_text_count
    else:
        average_short_text_rating = 0

    if word_frequencies:
        most_common_word, most_common_count = word_frequencies.most_common(1)[0]
    else:
        most_common_word, most_common_count = None, 0

    average_votes_by_star = {
        stars: {
            'useful': votes['useful'] / votes['count'],
            'funny': votes['funny'] / votes['count'],
            'cool': votes['cool'] / votes['count'],
        }
        for stars, votes in star_vote_totals.items()
        if votes['count'] > 0
    }

    return (
        star_counts,
        average_rating,
        average_text_length,
        average_short_text_rating,
        most_common_word,
        most_common_count,
        average_votes_by_star,
    )
def plot_reviews_and_votes(star_counts, average_votes_by_star):
    """Show a two-panel figure of review counts and average votes per star.

    The left panel is a bar chart of the number of reviews for each rating
    from 1 to 5. The right panel is a grouped bar chart of the average
    useful / funny / cool votes for the same ratings. Ratings missing from
    either mapping are drawn as zero.

    Args:
        star_counts: Mapping of star rating to number of reviews.
        average_votes_by_star: Mapping of star rating to a dict with the
            keys ``'useful'``, ``'funny'`` and ``'cool'``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    star_ratings = [1, 2, 3, 4, 5]
    # .get() so a rating with no reviews is plotted as an empty bar.
    review_counts = [star_counts.get(star, 0) for star in star_ratings]

    plt.figure(figsize=(10, 6))

    plt.subplot(1, 2, 1)
    plt.bar(star_ratings, review_counts, color='blue')
    plt.title('Number of Reviews per Star Rating')
    plt.xlabel('Star Rating')
    plt.ylabel('Number of Reviews')

    # Ratings without vote data fall back to zeros instead of raising.
    no_votes = {'useful': 0, 'funny': 0, 'cool': 0}
    votes_per_star = [
        average_votes_by_star.get(star, no_votes) for star in star_ratings
    ]

    plt.subplot(1, 2, 2)
    width = 0.2
    positions = range(len(star_ratings))
    series = (
        ('useful', 'Useful', 'green'),
        ('funny', 'Funny', 'red'),
        ('cool', 'Cool', 'blue'),
    )
    # Offsets -width, 0, +width keep each group of three bars adjacent and
    # centred on its tick.
    for offset, (key, label, color) in zip((-width, 0, width), series):
        plt.bar(
            [position + offset for position in positions],
            [votes[key] for votes in votes_per_star],
            width,
            label=label,
            color=color,
        )
    plt.title('Average Votes per Star Rating')
    plt.xlabel('Star Rating')
    plt.ylabel('Average Votes')
    plt.xticks(positions, star_ratings)
    plt.legend()

    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    # An optional first command-line argument overrides the default dataset
    # path; running with no arguments behaves as before.
    file_path = sys.argv[1] if len(sys.argv) > 1 else "yelp_academic_dataset_review.json"

    (
        star_counts,
        average_rating,
        average_text_length,
        average_short_text_rating,
        most_common_word,
        most_common_count,
        average_votes_by_star,
    ) = count_reviews_by_stars_and_average(file_path)

    for stars in sorted(star_counts):
        print(f"{stars} stars: {star_counts[stars]} reviews")

    print(f"Average rating: {average_rating:.2f}")
    print(f"Average text length: {average_text_length:.2f} characters")
    print(f"Average rating for reviews with text length < 10: {average_short_text_rating:.2f}")
    print(f"Most common word (in first 100,000 reviews): '{most_common_word}' (used {most_common_count} times)")

    print("Average votes per star rating:")
    # Sorted so the lines appear in ascending star order regardless of the
    # mapping's insertion order.
    for stars in sorted(average_votes_by_star):
        votes = average_votes_by_star[stars]
        print(f"{stars} stars - Useful: {votes['useful']:.2f}, Funny: {votes['funny']:.2f}, Cool: {votes['cool']:.2f}")

    plot_reviews_and_votes(star_counts, average_votes_by_star)