From c38bc618fa6e502f6db75c1d9005e6068cb2fbae Mon Sep 17 00:00:00 2001
From: Julien Phalip
Date: Mon, 6 Feb 2017 14:47:59 -0800
Subject: [PATCH] Initial commit

---
 .gitignore           |   1 +
 README.md            | 136 +++++++++++++++++++++++++++++++++++++++++++
 code/__init__.py     |   0
 code/language_api.py |  82 ++++++++++++++++++++++++++
 code/plotting.py     | 136 +++++++++++++++++++++++++++++++++++++++++++
 code/utils.py        |  67 +++++++++++++++++++++
 code/youtube_api.py  |  65 +++++++++++++++++++++
 7 files changed, 487 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 code/__init__.py
 create mode 100644 code/language_api.py
 create mode 100644 code/plotting.py
 create mode 100644 code/utils.py
 create mode 100644 code/youtube_api.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1045803
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
google-api-key.txt

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..bf1f7a0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,136 @@
This repository contains all the code I wrote to support a case study published on my blog:

The study aims to evaluate bias in the media using sentiment analysis of the video titles published by some prominent American TV channels on their YouTube accounts.

Setup
=====

A bit of setting up is required before you can run this code.

Google API key
--------------

First, you need to get an API key from Google by following the steps described here: https://developers.google.com/api-client-library/python/guide/aaa_apikeys

This key will be used for two services:

 - Google Cloud Natural Language API
 - YouTube Data API v3

Once you've acquired a key, save it into a file named `google-api-key.txt` at the root of this repository.

Python environment
------------------

The following Python packages need to be installed in your Python environment:

    ipython==5.1.0
    pandas==0.19.1
    google-api-python-client==1.5.5
    unicodecsv==0.14.1

Acquiring data
==============

Four types of datasets must be generated: channels, topics, videos, and sentiment scores.

Channels
--------

Create a `channels.csv` file using the structure detailed in this example:

```python
import pandas

channels = pandas.DataFrame.from_records([
    {'title': 'Fox News', 'slug': 'fox-news', 'youtube_id': 'UCXIJgqnII2ZOINSWNOGFThA', 'playlist_id': 'UUXIJgqnII2ZOINSWNOGFThA', 'url': 'https://www.youtube.com/user/FoxNewsChannel', 'color': '#5975a4'},
    {'title': 'CNN', 'slug': 'cnn', 'youtube_id': 'UCupvZG-5ko_eiXAupbDfxWw', 'playlist_id': 'UUupvZG-5ko_eiXAupbDfxWw', 'url': 'https://www.youtube.com/user/CNN', 'color': '#b55d60'},
    {'title': 'MSNBC', 'slug': 'msnbc', 'youtube_id': 'UCaXkIU1QidjPwiAYu6GcHjg', 'playlist_id': 'UUaXkIU1QidjPwiAYu6GcHjg', 'url': 'https://www.youtube.com/user/msnbcleanforward', 'color': '#5f9e6e'},
    {'title': 'CBS News', 'slug': 'cbs-news', 'youtube_id': 'UC8p1vwvWtl6T73JiExfWs1g', 'playlist_id': 'UU8p1vwvWtl6T73JiExfWs1g', 'url': 'https://www.youtube.com/user/CBSNewsOnline', 'color': '#666666'},
])

channels.to_csv('channels.csv', index=False, encoding='utf-8')
```

The `youtube_id` is the channel's unique YouTube ID. Finding out a channel's ID is a little tricky:

- Go to the channel's page (e.g. https://www.youtube.com/user/CNN).
- View the HTML source of the page.
- Look for "data-channel-external-id" in the HTML source. The value associated with it is the channel's YouTube ID.
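Alternatively, you can look the values up programmatically. Here is a minimal sketch that queries the YouTube Data API's `channels().list` endpoint for a channel's ID and its default "uploads" playlist (the `playlist_id` described below). It assumes your key is already saved in `google-api-key.txt` as described above; the `lookup_channel` helper is just for illustration and not part of this repository's code:

```python
from apiclient.discovery import build

API_KEY = open('google-api-key.txt', 'r').read()
youtube_service = build('youtube', 'v3', developerKey=API_KEY)


def lookup_channel(username):
    # Ask the Data API for the channel's ID and its related playlists.
    response = youtube_service.channels().list(
        part='id,contentDetails', forUsername=username).execute()
    channel = response['items'][0]  # Assumes the username exists
    return {
        'youtube_id': channel['id'],
        'playlist_id': channel['contentDetails']['relatedPlaylists']['uploads'],
    }


print(lookup_channel('CNN'))
# {'youtube_id': 'UCupvZG-5ko_eiXAupbDfxWw', 'playlist_id': 'UUupvZG-5ko_eiXAupbDfxWw'}
```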
The `playlist_id` corresponds to a channel's default playlist, where all of its videos are published. To retrieve a channel's `playlist_id`:

- Visit this URL after replacing "CHANNEL-ID" with the channel's ID: https://developers.google.com/apis-explorer/#search/youtube/youtube/v3/youtube.channels.list?part=contentDetails&id=CHANNEL-ID
- Click the "Execute without OAuth" link at the bottom of the page.
- The playlist ID is now presented in the field `items[0].contentDetails.relatedPlaylists.uploads`.

Topics
------

Create a `topics.csv` file using the structure detailed in this example:

```python
import pandas

topics = pandas.DataFrame.from_records([
    {'title': 'Obama', 'slug': 'obama', 'variant1': 'Obama', 'variant2': 'Obamas'},
    {'title': 'Clinton', 'slug': 'clinton', 'variant1': 'Clinton', 'variant2': 'Clintons'},
    {'title': 'Trump', 'slug': 'trump', 'variant1': 'Trump', 'variant2': 'Trumps'},
    {'title': 'Democrats', 'slug': 'democrats', 'variant1': 'Democrat', 'variant2': 'Democrats'},
    {'title': 'Republicans', 'slug': 'republicans', 'variant1': 'Republican', 'variant2': 'Republicans'},
    {'title': 'Liberals', 'slug': 'liberals', 'variant1': 'Liberal', 'variant2': 'Liberals'},
    {'title': 'Conservatives', 'slug': 'conservatives', 'variant1': 'Conservative', 'variant2': 'Conservatives'},
])

topics.to_csv('topics.csv', index=False, encoding='utf-8')
```

The variants are the different terms that will be searched for in the video titles in order to match videos with your topics of choice.

Videos
------

Run the following snippets of code in order to download all the video metadata from YouTube for your channels of choice.

First, this will download all video information and create a separate CSV file for each channel (e.g. `videos-cnn.csv`):

```python
from code.youtube_api import download_channels_videos

download_channels_videos(channels)
```

Second, this will merge all the CSV files generated above into a single `videos-MERGED.csv` file:

```python
from code.youtube_api import merge_channel_videos

merge_channel_videos(channels)
```

Lastly, this will create extra columns for each topic:

```python
import pandas as pd

from code.utils import create_topic_columns

videos = pd.read_csv('videos-MERGED.csv')
create_topic_columns(videos, topics)
videos.to_csv('videos.csv', index=False, encoding='utf-8')
```

You now have a `videos.csv` file containing all the video metadata for all channels.

Sentiment scores
----------------

The last step is to download sentiment scores from the Google Natural Language API. **Note that this API is not free.**
Make sure to first refer to the API's [pricing page](https://cloud.google.com/natural-language/pricing) for adequate budgeting.

Run the following to fetch scores for the relevant videos (i.e. those that matched at least one topic):

```python
from code.language_api import download_sentiments

download_sentiments(videos[videos.relevant])
```

You now have a `sentiments.csv` file containing the sentiment scores for all relevant videos.

Exploring and analysing the data
================================

Coming soon...
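In the meantime, here is a minimal sketch of one way to start exploring, assuming the `videos.csv` and `sentiments.csv` files generated above (the `trump` column comes from the example topics; substitute your own slugs):

```python
import pandas as pd

videos = pd.read_csv('videos.csv')
sentiments = pd.read_csv('sentiments.csv')

# Attach each video's sentiment score to its metadata via its YouTube ID.
scored = videos.merge(sentiments, on='youtube_id', how='inner')

# For example: the average title sentiment per channel for the 'trump' topic.
print(scored[scored.trump].groupby('channel')['sentiment_score'].mean())
```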
diff --git a/code/__init__.py b/code/__init__.py
new file mode 100644
index 0000000..e69de29

diff --git a/code/language_api.py b/code/language_api.py
new file mode 100644
index 0000000..55bdbb7
--- /dev/null
+++ b/code/language_api.py
@@ -0,0 +1,82 @@
import os
import time
import json
import unicodecsv as csv
from apiclient.discovery import build
from apiclient.errors import HttpError


API_KEY = open('google-api-key.txt', 'r').read()
language_service = build('language', 'v1', developerKey=API_KEY)


def analyze_sentiment(text):
    """
    Sends a request to the Google Natural Language API to analyze
    the sentiment of the given piece of text.
    """
    request = language_service.documents().analyzeSentiment(
        body={
            'document': {
                'type': 'PLAIN_TEXT',
                'content': text,
            }
        })
    return request.execute()


def download_sentiments(videos, output_file='sentiments.csv'):
    """
    Downloads sentiment scores from the Google Natural Language API
    for the given videos, then stores the results in a CSV file.
    """

    # Time to wait when we get rate-limited
    wait_time = 120

    # Create a new (or open an existing) CSV file to hold the sentiment analysis values
    if os.path.isfile(output_file):
        # Open the existing file in "append" mode
        f = open(output_file, 'a')
        writer = csv.writer(f, encoding='utf-8')
    else:
        # Open a new file in "write" mode and add the headers
        f = open(output_file, 'w')
        writer = csv.writer(f, encoding='utf-8')
        writer.writerow(['youtube_id', 'sentiment', 'sentiment_score', 'sentiment_magnitude'])

    i = 0
    n_videos = videos.shape[0]
    print 'Start processing %s videos...' % n_videos
    while i < n_videos:
        video = videos.iloc[i]
        try:
            # Send a request to the Google Natural Language API for the current video
            sentiment = analyze_sentiment(video['title'])
            # Add the result to the CSV file
            writer.writerow([
                video['youtube_id'],
                json.dumps(sentiment),
                sentiment['documentSentiment']['score'],
                sentiment['documentSentiment']['magnitude'],
            ])
            # Move on to the next video
            i += 1
        except HttpError as e:
            if e.resp.status == 429:
                print 'Processed %s/%s videos so far...' % (i, n_videos)
                # We got rate-limited, so wait a bit before trying again with the same video
                time.sleep(wait_time)
            elif e.resp.status == 400:
                # Bad request. Probably something wrong with the video's text
                error_content = json.loads(e.content)['error']
                print 'Error [%s] for video %s: %s' % (
                    error_content['code'], video['youtube_id'], error_content['message'])
                # Move on to the next video
                i += 1
            else:
                print 'Unhandled error for video %s: %s' % (
                    video['youtube_id'], video['title'])
                raise
    f.close()
    print 'Finished processing %s videos.' % n_videos

diff --git a/code/plotting.py b/code/plotting.py
new file mode 100644
index 0000000..4db5e8f
--- /dev/null
+++ b/code/plotting.py
@@ -0,0 +1,136 @@
from __future__ import division
import math
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns


def plot_channel_stats(stats, topics, channels, fig_height=8, y_center=False, title=None):
    """
    Plots bar charts for the given channel stats.
    A separate subplot is generated for each given topic.
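    `stats` is expected to hold one row per channel (indexed and
    sorted by channel title) and one column per topic slug.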
    """
    fig, axes = plt.subplots(nrows=int(math.ceil(topics.shape[0]/2)), ncols=2, figsize=(8, fig_height))
    fig.subplots_adjust(hspace=.5)

    for i, topic in topics.reset_index().iterrows():
        ax = fig.axes[i]

        # If requested, center all axes around 0
        if y_center:
            # Calculate the approximate amplitude of the given stats values
            amplitude = math.ceil(stats.abs().values.max()*10)/10
            ax.set_ylim(-amplitude, amplitude)

        # If we have negative values, grey out the negative space for better contrast
        if stats.values.min() < 0:
            ax.axhspan(0, ax.get_ylim()[0], facecolor='0.2', alpha=0.15)

        color = channels.sort_values('title').color
        ax.bar(range(len(stats.index)), stats[topic.slug], tick_label=stats.index, color=color, align='center')
        ax.set_title(topic.title, size=11)

    # Hide the potential last empty subplot
    if topics.shape[0] % 2:
        fig.axes[-1].axis('off')

    # Optional title at the top
    if title is not None:
        multiline = '\n' in title
        y = 1. if multiline else .96
        plt.suptitle(title, size=14, y=y)

    plt.show()


def plot_compressed_channel_stats(stats, color=None, y_center=False, title=None):
    """
    Similar to `plot_channel_stats`, except that everything is represented
    in a single plot (i.e. no subplots).
    """
    plt.figure(figsize=(6, 4))
    ax = plt.gca()

    # If requested, center all axes around 0
    if y_center:
        # Calculate the approximate amplitude of the given stats values
        amplitude = math.ceil(stats.abs().values.max()*10)/10
        ax.set_ylim(-amplitude, amplitude)

    # If we have negative values, grey out the negative space
    # for better contrast
    if stats.values.min() < 0:
        ax.axhspan(0, ax.get_ylim()[0], facecolor='0.2', alpha=0.15)

    # The actual plot
    stats.plot(kind='bar', color=color, width=0.6, ax=ax)

    # Presentation cleanup
    plt.xlabel('')
    plt.xticks(rotation=0)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    # Optional title at the top
    if title is not None:
        plt.title(title)

    plt.show()


def plot_sentiment_series(videos, topics, channels, start_date=None, title=None):
    """
    Plots linear time series of sentiment scores for the given videos.
    A separate subplot is generated for each topic. Each subplot
    has one time series for each channel, and one time series for the
    average values across all channels.
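    `videos` is expected to contain `published_at`, `channel` and
    `sentiment_score` columns, plus one boolean column per topic slug
    (i.e. the video metadata joined with the downloaded sentiment scores).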
    """
    fig, axes = plt.subplots(nrows=topics.shape[0], ncols=1, figsize=(8, 4*topics.shape[0]))
    fig.subplots_adjust(hspace=.3)

    # Resample rule: 2-week buckets
    resample_rule = '2W'

    # Calculate the approximate amplitude of the given sentiment values
    amplitude = math.ceil(videos.sentiment_score.abs().max()*10)/10

    for i, topic in topics.reset_index().iterrows():
        ax = fig.axes[i]
        # Grey out the negative sentiment area
        ax.axhspan(0, -1, facecolor='0.2', alpha=0.15)

        # Plot a time series for the average sentiment across all channels
        topic_mask = videos[topic.slug]
        if start_date is not None:
            topic_mask = topic_mask & (videos.published_at >= start_date)
        ts = videos[topic_mask].set_index('published_at').resample(resample_rule)['sentiment_score'].mean().interpolate()
        sns.tsplot(ts, ts.index, color='#fcef99', linewidth=6, ax=ax)

        # Plot a separate time series for each channel
        for _, channel in channels.iterrows():
            channel_mask = topic_mask & (videos.channel == channel.title)
            ts = videos[channel_mask].set_index('published_at').resample(resample_rule)['sentiment_score'].mean().interpolate()
            if len(ts) > 1:
                sns.tsplot(ts, ts.index, color=channel['color'], linewidth=1, ax=ax)

        # Format the x-axis labels as dates
        xvalues = ax.xaxis.get_majorticklocs()
        xlabels = [datetime.utcfromtimestamp(x/1e9).strftime('%Y.%m') for x in xvalues]
        ax.set_xticklabels(xlabels)

        # A little extra presentation cleanup
        ax.set_xlabel('')
        ax.set_title(topic['title'], size=11)
        ax.set_ylim(-amplitude, amplitude)

        # Add the legend
        handles = [Patch(color='#fcef99', label='Average')]
        for _, channel in channels.iterrows():
            handles.append(Patch(color=channel['color'], label=channel['title']))
        ax.legend(handles=handles, fontsize=8)

    # Optional title at the top
    if title is not None:
        plt.suptitle(title, size=14, y=.92)

    plt.show()

diff --git a/code/utils.py b/code/utils.py
new file mode 100644
index 0000000..cf0fee7
--- /dev/null
+++ b/code/utils.py
@@ -0,0 +1,67 @@
import re

import numpy as np
import pandas as pd
from IPython.display import HTML


def show_videos(videos, ids, columns=None):
    """
    Shows some basic information about the videos with
    the given YouTube IDs. Unlike the default
    pandas HTML representation, the information is never
    truncated (no max_colwidth limits).
    """
    if columns is None:
        columns = ['title', 'sentiment_score', 'channel', 'published_at', 'youtube_id']
    with pd.option_context('display.max_colwidth', -1):
        return HTML(
            videos[videos.youtube_id.isin(ids)][columns].to_html(index=False)
        )


def get_variants(topic):
    """
    Returns all variants for the given topic.
    """
    return [topic['variant%s' % i] for i in range(1, 3) if not pd.isnull(topic['variant%s' % i])]


def get_pattern(topic):
    """
    Compiles and returns the regular expression pattern
    matching all variants of the given topic.
    """
    variants = get_variants(topic)
    sub_patterns = [r'(.*\b)%s\b(.*)' % variant.lower() for variant in variants]
    return re.compile(r'|'.join(sub_patterns), flags=re.IGNORECASE)


def is_relevant(video, topic_pattern):
    """
    Returns True if the given topic is relevant to the given video.
    """
    return bool(topic_pattern.match(video['title']))


def create_topic_columns(videos, topics):
    """
    Creates a separate column in the given `videos` dataframe
    for each given topic. Those columns will contain `True` values
    for videos that mention the corresponding topic.
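    Matching is case-insensitive and only considers whole words
    (see `get_pattern`).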
    Finally, creates a `relevant` column that will contain `True`
    for videos that mention any topic at all.
    """

    # Clear values
    videos['relevant'] = False

    # Create masks for each topic so we can later filter videos by topics
    topic_masks = []
    for _, topic in topics.iterrows():
        videos[topic['slug']] = False  # Clear values
        pattern = get_pattern(topic)
        topic_mask = videos.apply(lambda video: is_relevant(video, pattern), axis=1)
        topic_masks.append(topic_mask)
        videos[topic['slug']] = topic_mask

    # Mark a video as 'relevant' if it mentions any of the topics
    videos['relevant'] = np.any(np.column_stack(topic_masks), axis=1)

diff --git a/code/youtube_api.py b/code/youtube_api.py
new file mode 100644
index 0000000..e11cccd
--- /dev/null
+++ b/code/youtube_api.py
@@ -0,0 +1,65 @@
import pandas as pd
from IPython.display import clear_output
from apiclient.discovery import build


API_KEY = open('google-api-key.txt', 'r').read()
youtube_service = build('youtube', 'v3', developerKey=API_KEY)


def download_channel_videos(channel):
    """
    Downloads metadata for all videos of the given channel
    from the YouTube API.
    """
    videos = []
    pageToken = None
    while True:
        response = youtube_service.playlistItems().list(playlistId=channel['playlist_id'], part='snippet', pageToken=pageToken).execute()
        for video in response['items']:
            videos.append({
                'youtube_id': video['snippet']['resourceId']['videoId'],
                'title': video['snippet']['title'],
                'description': video['snippet']['description'],
                'published_at': video['snippet']['publishedAt'],
                'channel_youtube_id': channel['youtube_id'],
            })
        pageToken = response.get('nextPageToken')
        clear_output(wait=True)
        print 'Downloading videos from "{}": {}...'.format(channel['title'], len(videos))
        if pageToken is None:
            # There are no more videos to download
            clear_output()
            break
    return videos


def download_channels_videos(channels):
    """
    Downloads metadata for all videos of all the given channels,
    then creates a separate CSV file (named videos-<slug>.csv)
    with that information for each channel.
    """
    for _, channel in channels.iterrows():
        videos = download_channel_videos(channel)
        df = pd.DataFrame.from_records(videos)
        output_file = 'videos-{}.csv'.format(channel['slug'])
        df.to_csv(output_file, index=False, encoding='utf-8')
        print 'Generated file: %s' % output_file


def merge_channel_videos(channels, output_file='videos-MERGED.csv'):
    """
    Merges all the videos-<slug>.csv files previously generated by
    `download_channels_videos()` into a single videos-MERGED.csv file.
    """
    # Merge all videos together
    videos = []
    for _, channel in channels.iterrows():
        channel_videos = pd.read_csv('videos-%s.csv' % channel['slug'])
        channel_videos['channel'] = channel['title']
        videos.append(channel_videos)
    videos = pd.concat(videos, ignore_index=True)
    videos['description'].fillna('', inplace=True)
    videos.dropna(inplace=True)
    videos.to_csv(output_file, index=False, encoding='utf-8')
    print 'Channel videos merged into %s' % output_file