From c38bc618fa6e502f6db75c1d9005e6068cb2fbae Mon Sep 17 00:00:00 2001
From: Julien Phalip
Date: Mon, 6 Feb 2017 14:47:59 -0800
Subject: [PATCH] Initial commit

---
 .gitignore           |   1 +
 README.md            | 136 +++++++++++++++++++++++++++++++++++++++++++
 code/__init__.py     |   0
 code/language_api.py |  82 ++++++++++++++++++++++++++
 code/plotting.py     | 136 +++++++++++++++++++++++++++++++++++++++++++
 code/utils.py        |  67 +++++++++++++++++++++
 code/youtube_api.py  |  65 +++++++++++++++++++++
 7 files changed, 487 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 code/__init__.py
 create mode 100644 code/language_api.py
 create mode 100644 code/plotting.py
 create mode 100644 code/utils.py
 create mode 100644 code/youtube_api.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1045803
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
google-api-key.txt

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..bf1f7a0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,136 @@
This repository contains all the code I wrote to support a case study published on my blog:

The study aims to evaluate bias in the media using sentiment analysis of the video titles published by some prominent American TV channels on their YouTube accounts.

Setup
=====

A bit of setting up is required before you can run this code.

Google API key
--------------

First, you need to get an API key from Google by following the steps described here: https://developers.google.com/api-client-library/python/guide/aaa_apikeys

This key will be used for two services:

 - Google Cloud Natural Language API
 - YouTube Data API v3

Once you've acquired a key, save it into a file named `google-api-key.txt` at the root of this repository.

Python environment
------------------

The following Python packages need to be installed in your Python environment:

    ipython==5.1.0
    pandas==0.19.1
    google-api-python-client==1.5.5
    unicodecsv==0.14.1

Acquiring data
==============

Four types of datasets must be generated: channels, topics, videos, and sentiment scores.

Channels
--------

Create a `channels.csv` file using the structure detailed in this example:

```python
import pandas

channels = pandas.DataFrame.from_records([
    {'title': 'Fox News', 'slug': 'fox-news', 'youtube_id': 'UCXIJgqnII2ZOINSWNOGFThA', 'playlist_id': 'UUXIJgqnII2ZOINSWNOGFThA', 'url': 'https://www.youtube.com/user/FoxNewsChannel', 'color': '#5975a4'},
    {'title': 'CNN', 'slug': 'cnn', 'youtube_id': 'UCupvZG-5ko_eiXAupbDfxWw', 'playlist_id': 'UUupvZG-5ko_eiXAupbDfxWw', 'url': 'https://www.youtube.com/user/CNN', 'color': '#b55d60'},
    {'title': 'MSNBC', 'slug': 'msnbc', 'youtube_id': 'UCaXkIU1QidjPwiAYu6GcHjg', 'playlist_id': 'UUaXkIU1QidjPwiAYu6GcHjg', 'url': 'https://www.youtube.com/user/msnbcleanforward', 'color': '#5f9e6e'},
    {'title': 'CBS News', 'slug': 'cbs-news', 'youtube_id': 'UC8p1vwvWtl6T73JiExfWs1g', 'playlist_id': 'UU8p1vwvWtl6T73JiExfWs1g', 'url': 'https://www.youtube.com/user/CBSNewsOnline', 'color': '#666666'},
])

channels.to_csv('channels.csv', index=False, encoding='utf-8')
```

The `youtube_id` is the channel's unique YouTube ID. Finding out a channel's ID is a little tricky:

- Go to the channel's page (e.g. https://www.youtube.com/user/CNN).
- View the HTML source of the page.
- Look for "data-channel-external-id" in the HTML source. The value associated with it is the channel's YouTube ID.
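Alternatively, you can look the values up programmatically. Here is a minimal sketch that queries the YouTube Data API's `channels().list` endpoint for a channel's ID and its default "uploads" playlist (the `playlist_id` described below). It assumes your key is already saved in `google-api-key.txt` as described above; the `lookup_channel` helper is just for illustration and not part of this repository's code:

```python
from apiclient.discovery import build

API_KEY = open('google-api-key.txt', 'r').read()
youtube_service = build('youtube', 'v3', developerKey=API_KEY)


def lookup_channel(username):
    # Ask the Data API for the channel's ID and its related playlists.
    response = youtube_service.channels().list(
        part='id,contentDetails', forUsername=username).execute()
    channel = response['items'][0]  # Assumes the username exists
    return {
        'youtube_id': channel['id'],
        'playlist_id': channel['contentDetails']['relatedPlaylists']['uploads'],
    }


print(lookup_channel('CNN'))
# {'youtube_id': 'UCupvZG-5ko_eiXAupbDfxWw', 'playlist_id': 'UUupvZG-5ko_eiXAupbDfxWw'}
```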
The `playlist_id` corresponds to a channel's default playlist, where all of its videos are published. To retrieve a channel's `playlist_id`:

- Visit this URL after replacing "CHANNEL-ID" with the channel's ID: https://developers.google.com/apis-explorer/#search/youtube/youtube/v3/youtube.channels.list?part=contentDetails&id=CHANNEL-ID
- Click the "Execute without OAuth" link at the bottom of the page.
- The playlist ID is now presented in the field `items[0].contentDetails.relatedPlaylists.uploads`.

Topics
------

Create a `topics.csv` file using the structure detailed in this example:

```python
import pandas

topics = pandas.DataFrame.from_records([
    {'title': 'Obama', 'slug': 'obama', 'variant1': 'Obama', 'variant2': 'Obamas'},
    {'title': 'Clinton', 'slug': 'clinton', 'variant1': 'Clinton', 'variant2': 'Clintons'},
    {'title': 'Trump', 'slug': 'trump', 'variant1': 'Trump', 'variant2': 'Trumps'},
    {'title': 'Democrats', 'slug': 'democrats', 'variant1': 'Democrat', 'variant2': 'Democrats'},
    {'title': 'Republicans', 'slug': 'republicans', 'variant1': 'Republican', 'variant2': 'Republicans'},
    {'title': 'Liberals', 'slug': 'liberals', 'variant1': 'Liberal', 'variant2': 'Liberals'},
    {'title': 'Conservatives', 'slug': 'conservatives', 'variant1': 'Conservative', 'variant2': 'Conservatives'},
])

topics.to_csv('topics.csv', index=False, encoding='utf-8')
```

The variants are the different terms that will be searched for in the video titles in order to match videos with your topics of choice.

Videos
------

Run the following snippets of code in order to download all the video metadata from YouTube for your channels of choice.

First, this will download all video information and create a separate CSV file for each channel (e.g. `videos-cnn.csv`):

```python
from code.youtube_api import download_channels_videos

download_channels_videos(channels)
```

Second, this will merge all the CSV files generated above into a single `videos-MERGED.csv` file:

```python
from code.youtube_api import merge_channel_videos

merge_channel_videos(channels)
```

Lastly, this will create extra columns for each topic:

```python
import pandas as pd

from code.utils import create_topic_columns

videos = pd.read_csv('videos-MERGED.csv')
create_topic_columns(videos, topics)
videos.to_csv('videos.csv', index=False, encoding='utf-8')
```

You now have a `videos.csv` file containing all the video metadata for all channels.

Sentiment scores
----------------

The last step is to download sentiment scores from the Google Natural Language API. **Note that this API is not free.**
Make sure to first refer to the API's [pricing page](https://cloud.google.com/natural-language/pricing) for adequate budgeting.

Run the following to fetch scores for the relevant videos (i.e. those that matched at least one topic):

```python
from code.language_api import download_sentiments

download_sentiments(videos[videos.relevant])
```

You now have a `sentiments.csv` file containing the sentiment scores for all relevant videos.

Exploring and analysing the data
================================

Coming soon...
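In the meantime, here is a minimal sketch of one way to start exploring, assuming the `videos.csv` and `sentiments.csv` files generated above (the `trump` column comes from the example topics; substitute your own slugs):

```python
import pandas as pd

videos = pd.read_csv('videos.csv')
sentiments = pd.read_csv('sentiments.csv')

# Attach each video's sentiment score to its metadata via its YouTube ID.
scored = videos.merge(sentiments, on='youtube_id', how='inner')

# For example: the average title sentiment per channel for the 'trump' topic.
print(scored[scored.trump].groupby('channel')['sentiment_score'].mean())
```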
diff --git a/code/__init__.py b/code/__init__.py
new file mode 100644
index 0000000..e69de29

diff --git a/code/language_api.py b/code/language_api.py
new file mode 100644
index 0000000..55bdbb7
--- /dev/null
+++ b/code/language_api.py
@@ -0,0 +1,82 @@
import os
import time
import json
import unicodecsv as csv
from apiclient.discovery import build
from apiclient.errors import HttpError


API_KEY = open('google-api-key.txt', 'r').read()
language_service = build('language', 'v1', developerKey=API_KEY)


def analyze_sentiment(text):
    """
    Sends a request to the Google Natural Language API to analyze
    the sentiment of the given piece of text.
    """
    request = language_service.documents().analyzeSentiment(
        body={
            'document': {
                'type': 'PLAIN_TEXT',
                'content': text,
            }
        })
    return request.execute()


def download_sentiments(videos, output_file='sentiments.csv'):
    """
    Downloads sentiment scores from the Google Natural Language API
    for the given videos, then stores the results in a CSV file.
    """

    # Time to wait when we get rate-limited
    wait_time = 120

    # Create a new (or open an existing) CSV file to hold the sentiment analysis values
    if os.path.isfile(output_file):
        # Open the existing file in "append" mode
        f = open(output_file, 'a')
        writer = csv.writer(f, encoding='utf-8')
    else:
        # Open a new file in "write" mode and add the headers
        f = open(output_file, 'w')
        writer = csv.writer(f, encoding='utf-8')
        writer.writerow(['youtube_id', 'sentiment', 'sentiment_score', 'sentiment_magnitude'])

    i = 0
    n_videos = videos.shape[0]
    print 'Start processing %s videos...' % n_videos
    while i < n_videos:
        video = videos.iloc[i]
        try:
            # Send a request to the Google Natural Language API for the current video
            sentiment = analyze_sentiment(video['title'])
            # Add the result to the CSV file
            writer.writerow([
                video['youtube_id'],
                json.dumps(sentiment),
                sentiment['documentSentiment']['score'],
                sentiment['documentSentiment']['magnitude'],
            ])
            # Move on to the next video
            i += 1
        except HttpError as e:
            if e.resp.status == 429:
                print 'Processed %s/%s videos so far...' % (i, n_videos)
                # We got rate-limited, so wait a bit before trying again with the same video
                time.sleep(wait_time)
            elif e.resp.status == 400:
                # Bad request. Probably something wrong with the video's text
                error_content = json.loads(e.content)['error']
                print 'Error [%s] for video %s: %s' % (
                    error_content['code'], video['youtube_id'], error_content['message'])
                # Move on to the next video
                i += 1
            else:
                print 'Unhandled error for video %s: %s' % (
                    video['youtube_id'], video['title'])
                raise
    f.close()
    print 'Finished processing %s videos.' % n_videos

diff --git a/code/plotting.py b/code/plotting.py
new file mode 100644
index 0000000..4db5e8f
--- /dev/null
+++ b/code/plotting.py
@@ -0,0 +1,136 @@
from __future__ import division
import math
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns


def plot_channel_stats(stats, topics, channels, fig_height=8, y_center=False, title=None):
    """
    Plots bar charts for the given channel stats.
    A separate subplot is generated for each given topic.
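    `stats` is expected to hold one row per channel (indexed and
    sorted by channel title) and one column per topic slug.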
    """
    fig, axes = plt.subplots(nrows=int(math.ceil(topics.shape[0]/2)), ncols=2, figsize=(8, fig_height))
    fig.subplots_adjust(hspace=.5)

    for i, topic in topics.reset_index().iterrows():
        ax = fig.axes[i]

        # If requested, center all axes around 0
        if y_center:
            # Calculate the approximate amplitude of the given stats values
            amplitude = math.ceil(stats.abs().values.max()*10)/10
            ax.set_ylim(-amplitude, amplitude)

        # If we have negative values, grey out the negative space for better contrast
        if stats.values.min() < 0:
            ax.axhspan(0, ax.get_ylim()[0], facecolor='0.2', alpha=0.15)

        color = channels.sort_values('title').color
        ax.bar(range(len(stats.index)), stats[topic.slug], tick_label=stats.index, color=color, align='center')
        ax.set_title(topic.title, size=11)

    # Hide the potential last empty subplot
    if topics.shape[0] % 2:
        fig.axes[-1].axis('off')

    # Optional title at the top
    if title is not None:
        multiline = '\n' in title
        y = 1. if multiline else .96
        plt.suptitle(title, size=14, y=y)

    plt.show()


def plot_compressed_channel_stats(stats, color=None, y_center=False, title=None):
    """
    Similar to `plot_channel_stats`, except that everything is represented
    in a single plot (i.e. no subplots).
    """
    plt.figure(figsize=(6, 4))
    ax = plt.gca()

    # If requested, center all axes around 0
    if y_center:
        # Calculate the approximate amplitude of the given stats values
        amplitude = math.ceil(stats.abs().values.max()*10)/10
        ax.set_ylim(-amplitude, amplitude)

    # If we have negative values, grey out the negative space
    # for better contrast
    if stats.values.min() < 0:
        ax.axhspan(0, ax.get_ylim()[0], facecolor='0.2', alpha=0.15)

    # The actual plot
    stats.plot(kind='bar', color=color, width=0.6, ax=ax)

    # Presentation cleanup
    plt.xlabel('')
    plt.xticks(rotation=0)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    # Optional title at the top
    if title is not None:
        plt.title(title)

    plt.show()


def plot_sentiment_series(videos, topics, channels, start_date=None, title=None):
    """
    Plots linear time series of sentiment scores for the given videos.
    A separate subplot is generated for each topic. Each subplot
    has one time series for each channel, and one time series for the
    average values across all channels.
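    `videos` is expected to contain `published_at`, `channel` and
    `sentiment_score` columns, plus one boolean column per topic slug
    (i.e. the video metadata joined with the downloaded sentiment scores).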
    """
    fig, axes = plt.subplots(nrows=topics.shape[0], ncols=1, figsize=(8, 4*topics.shape[0]))
    fig.subplots_adjust(hspace=.3)

    # Resample rule: 2-week buckets
    resample_rule = '2W'

    # Calculate the approximate amplitude of the given sentiment values
    amplitude = math.ceil(videos.sentiment_score.abs().max()*10)/10

    for i, topic in topics.reset_index().iterrows():
        ax = fig.axes[i]
        # Grey out the negative sentiment area
        ax.axhspan(0, -1, facecolor='0.2', alpha=0.15)

        # Plot a time series for the average sentiment across all channels
        topic_mask = videos[topic.slug]
        if start_date is not None:
            topic_mask = topic_mask & (videos.published_at >= start_date)
        ts = videos[topic_mask].set_index('published_at').resample(resample_rule)['sentiment_score'].mean().interpolate()
        sns.tsplot(ts, ts.index, color='#fcef99', linewidth=6, ax=ax)

        # Plot a separate time series for each channel
        for _, channel in channels.iterrows():
            channel_mask = topic_mask & (videos.channel == channel.title)
            ts = videos[channel_mask].set_index('published_at').resample(resample_rule)['sentiment_score'].mean().interpolate()
            if len(ts) > 1:
                sns.tsplot(ts, ts.index, color=channel['color'], linewidth=1, ax=ax)

        # Format the x-axis labels as dates
        xvalues = ax.xaxis.get_majorticklocs()
        xlabels = [datetime.utcfromtimestamp(x/1e9).strftime('%Y.%m') for x in xvalues]
        ax.set_xticklabels(xlabels)

        # A little extra presentation cleanup
        ax.set_xlabel('')
        ax.set_title(topic['title'], size=11)
        ax.set_ylim(-amplitude, amplitude)

        # Add the legend
        handles = [Patch(color='#fcef99', label='Average')]
        for _, channel in channels.iterrows():
            handles.append(Patch(color=channel['color'], label=channel['title']))
        ax.legend(handles=handles, fontsize=8)

    # Optional title at the top
    if title is not None:
        plt.suptitle(title, size=14, y=.92)

    plt.show()

diff --git a/code/utils.py b/code/utils.py
new file mode 100644
index 0000000..cf0fee7
--- /dev/null
+++ b/code/utils.py
@@ -0,0 +1,67 @@
import re

import numpy as np
import pandas as pd
from IPython.display import HTML


def show_videos(videos, ids, columns=None):
    """
    Shows some basic information about the videos with
    the given YouTube IDs. Unlike the default
    pandas HTML representation, the information is never
    truncated (no max_colwidth limits).
    """
    if columns is None:
        columns = ['title', 'sentiment_score', 'channel', 'published_at', 'youtube_id']
    with pd.option_context('display.max_colwidth', -1):
        return HTML(
            videos[videos.youtube_id.isin(ids)][columns].to_html(index=False)
        )


def get_variants(topic):
    """
    Returns all variants for the given topic.
    """
    return [topic['variant%s' % i] for i in range(1, 3) if not pd.isnull(topic['variant%s' % i])]


def get_pattern(topic):
    """
    Compiles and returns the regular expression pattern
    matching all variants of the given topic.
    """
    variants = get_variants(topic)
    sub_patterns = [r'(.*\b)%s\b(.*)' % variant.lower() for variant in variants]
    return re.compile(r'|'.join(sub_patterns), flags=re.IGNORECASE)


def is_relevant(video, topic_pattern):
    """
    Returns True if the given topic is relevant to the given video.
    """
    return bool(topic_pattern.match(video['title']))


def create_topic_columns(videos, topics):
    """
    Creates a separate column in the given `videos` dataframe
    for each given topic. Those columns will contain `True` values
    for videos that mention the corresponding topic.
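    Matching is case-insensitive and only considers whole words
    (see `get_pattern`).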
    Finally, creates a `relevant` column that will contain `True`
    for videos that mention any topic at all.
    """

    # Clear values
    videos['relevant'] = False

    # Create masks for each topic so we can later filter videos by topics
    topic_masks = []
    for _, topic in topics.iterrows():
        videos[topic['slug']] = False  # Clear values
        pattern = get_pattern(topic)
        topic_mask = videos.apply(lambda video: is_relevant(video, pattern), axis=1)
        topic_masks.append(topic_mask)
        videos[topic['slug']] = topic_mask

    # Mark a video as 'relevant' if it mentions any of the topics
    videos['relevant'] = np.any(np.column_stack(topic_masks), axis=1)

diff --git a/code/youtube_api.py b/code/youtube_api.py
new file mode 100644
index 0000000..e11cccd
--- /dev/null
+++ b/code/youtube_api.py
@@ -0,0 +1,65 @@
import pandas as pd
from IPython.display import clear_output
from apiclient.discovery import build


API_KEY = open('google-api-key.txt', 'r').read()
youtube_service = build('youtube', 'v3', developerKey=API_KEY)


def download_channel_videos(channel):
    """
    Downloads metadata for all videos of the given channel
    from the YouTube API.
    """
    videos = []
    pageToken = None
    while True:
        response = youtube_service.playlistItems().list(playlistId=channel['playlist_id'], part='snippet', pageToken=pageToken).execute()
        for video in response['items']:
            videos.append({
                'youtube_id': video['snippet']['resourceId']['videoId'],
                'title': video['snippet']['title'],
                'description': video['snippet']['description'],
                'published_at': video['snippet']['publishedAt'],
                'channel_youtube_id': channel['youtube_id'],
            })
        pageToken = response.get('nextPageToken')
        clear_output(wait=True)
        print 'Downloading videos from "{}": {}...'.format(channel['title'], len(videos))
        if pageToken is None:
            # There are no more videos to download
            clear_output()
            break
    return videos


def download_channels_videos(channels):
    """
    Downloads metadata for all videos of all the given channels,
    then creates a separate CSV file (named videos-<slug>.csv)
    with that information for each channel.
    """
    for _, channel in channels.iterrows():
        videos = download_channel_videos(channel)
        df = pd.DataFrame.from_records(videos)
        output_file = 'videos-{}.csv'.format(channel['slug'])
        df.to_csv(output_file, index=False, encoding='utf-8')
        print 'Generated file: %s' % output_file


def merge_channel_videos(channels, output_file='videos-MERGED.csv'):
    """
    Merges all the videos-<slug>.csv files previously generated by
    `download_channels_videos()` into a single videos-MERGED.csv file.
    """
    # Merge all videos together
    videos = []
    for _, channel in channels.iterrows():
        channel_videos = pd.read_csv('videos-%s.csv' % channel['slug'])
        channel_videos['channel'] = channel['title']
        videos.append(channel_videos)
    videos = pd.concat(videos, ignore_index=True)
    videos['description'].fillna('', inplace=True)
    videos.dropna(inplace=True)
    videos.to_csv(output_file, index=False, encoding='utf-8')
    print 'Channel videos merged into %s' % output_file