Scrape the top 100 series from IMDb and plot some of the data (date: 27.06.2017)

scraping through BeautifulSoup & IMDbPY, plotting with bokeh, and tqdm for sweet, smart progress bars

Zur besseren Lesbarkeit des jupyter notebooks Toggler-Buttons mittels jQuery einbauen
In [1]:
#@toggleInputDefault
from IPython.display import HTML
#Methode zum togglen(show/hide) markierter Inputs/Outputs mit jQuery(html,css)
HTML('''<script>
function toggler(){
    if(window.already_toggling){
        // Don't add multiple buttons. Needed for in notebokk use.
        return 0;
    }
    
    /*quick way if all inputs/outputs get a button
    let btnInput = $('.cell').prepend('<button>&#128065;Toggle this input</button>').children('button');
    let btnOutput = $('.output_wrapper').prepend('<button>Toggle this output</button>').children('button');
    */
    
    //add buttons to inputs/outputs with @toggleInput/@toggleOutput #comments
    let btnInput;
    let btnOutput;
    if ($('ipython-main-app') == true) {
        //#comment is class .cm-comment in notebook
        btnInput = $('..cm-comment:contains(@toggleInput)').closest('.cell')
                    .prepend('<button>Hide this input</button>').children('button');
        btnOutput = $('..cm-comment:contains(@toggleOutput)').closest('.cell').children('.output_wrapper')
                    .prepend('<button>Hide this output</button>').children('button');
    } else {
        //#comment is class .c1 in html!
        btnInput = $('.c1:contains(@toggleInput)').closest('.cell')
                    .prepend('<button>Hide this input</button>').children('button');
        btnOutput = $('.c1:contains(@toggleOutput)').closest('.cell').children('.output_wrapper')
                    .prepend('<button>Hide this output</button>').children('button');
    }
    
    //store alternative text to change/swap on toggle
    btnInput.data("altText", "Show this input");    
    btnOutput.data("altText", "Show this output");
    
    //style buttons
    btnInput.css({
        "background-color": "transparent",
        "color": "#303F9F",
        "border": "1px solid #303F9F",
        "margin-bottom": "5px"
    });
    btnOutput.css({
        "background-color": "transparent",
        "color": "#D84315",
        "border": "1px solid #D84315",
        "margin-top": "5px"
    });
    
    //add button hover functionality/style
    btnInput.hover(function(){
        $(this).css({"opacity": "0.5"});
    }, function(){
        $(this).css({"opacity": "1"});
    });
    btnOutput.hover(function(){
        $(this).css({"opacity": "0.5"});
    }, function(){
        $(this).css({"opacity": "1"});
    });
    
    //add button click functionality (toggle + textchange)
    btnInput.on('click', function(){
        $(this).siblings('.input').slideToggle();
        let oldText = $(this).text();
        $(this).text($(this).data("altText"));
        $(this).data("altText", oldText);
    })
    btnOutput.on('click', function(){
        $(this).siblings('.output').slideToggle();
        let oldText = $(this).text();
        $(this).text($(this).data("altText"));
        $(this).data("altText", oldText);
    })
    
    //hide inputs/outputs with #comments @toggleInputDefault/@toggleOutputDefault
    //#comment is class .cm-comment in notebook and .c1 in html! 
    //#@toggleInputDefault
    $('.cm-comment:contains(@toggleInputDefault)').closest('.cell')
        .children('button').trigger('click');
    $('.c1:contains(@toggleInputDefault)').closest('.cell')
        .children('button').trigger('click');
    //#@toggleOutputDefault
    $('.cm-comment:contains(@toggleOutputDefault)').closest('.cell').children('.output_wrapper')
        .children('button').trigger('click');
    $('.c1:contains(@toggleOutputDefault)').closest('.cell').children('.output_wrapper')
        .children('button').trigger('click');
    
    window.already_toggling = true;
}
$( document ).ready(toggler);
                 
</script>''')
Out[1]:
In [1]:
#@toggleInputDefault
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from imdb import IMDb
from collections import OrderedDict
In [2]:
#@toggleInputDefault
r = requests.get('http://www.imdb.com/chart/tvmeter?ref_=nv_tvv_mptv_4')
c = r.content
soup = BeautifulSoup(c, 'html.parser')
In [3]:
#@toggleInputDefault
#Tabelle mit den Serien finden
table = soup.find('tbody', 'lister-list')

#alle Einträge erfassen
titleColumns = table.findAll('td', 'titleColumn')

Von IMDb Serien IDs abgreifen und in Dict füllen

In [4]:
#@toggleInputDefault
#@toggleOutputDefault
series_data = {}
imdb = IMDb()
rank = 1
#Einträge durchgehen und Dict mit den IDs füllen
for titleColumn in tqdm(titleColumns):
    #<a>-Tag im Eintrag finden
    a = titleColumn.find('a')
    #in der dort zu findenden URL die Stelle der ID herausfinden
    i = a['href'].find('tt')
    #ID herauskopieren
    iid = a['href'][i+2:i+9]
    m = imdb.get_movie(iid)
    #den Titel über die IMDb-Biblio nehmen, damit Original Titel und nicht deutscher von der Webseite
    title = m['title']
    #Serie mit Titel und ID in der gewünschten Struktur im Dict abspeichern
    dichelp = {}
    dichelp['id'] = iid.encode('ascii','ignore') #ID braucht kein U-Code, simpler String also
    dichelp['rank'] = rank
    #movieObject auch gleich abspeichern, um später nicht erneut aufrufen zu müssen!
    dichelp['movieObject'] = m
    rank += 1
    if title in series_data:#falls Title schon gehabt
        for name, series in list(series_data.iteritems()):
            if title == name:#Serie mit gleichem Title raussuchen
                if iid != series['id']:#prüfen ob das die gleiche Serie ist oder nicht
                    series_data[name+' ('+str(series['movieObject']['year'])+')'] = series_data.pop(name)
                    series_data[title+' ('+str(m['year'])+')'] = dichelp #falls nicht Namensvariationen benutzen ansonsten(gleiche Serie) ignorieren
    else:
        series_data[title] = dichelp
100%|██████████| 100/100 [08:37<00:00,  4.07s/it]
In [5]:
#@toggleInputDefault
#@toggleOutputDefault
series_data
Out[5]:
{u'13 Reasons Why': {'id': '1837492',
  'movieObject': <Movie id:1837492[http] title:_"13 Reasons Why" (2017)_>,
  'rank': 18},
 u'Agents of S.H.I.E.L.D.': {'id': '2364582',
  'movieObject': <Movie id:2364582[http] title:_"Agents of S.H.I.E.L.D." (2013)_>,
  'rank': 42},
 u'American Gods': {'id': '1898069',
  'movieObject': <Movie id:1898069[http] title:_"American Gods" (2017)_>,
  'rank': 4},
 u'American Horror Story': {'id': '1844624',
  'movieObject': <Movie id:1844624[http] title:_"American Horror Story" (2011)_>,
  'rank': 51},
 u'Animal Kingdom': {'id': '5574490',
  'movieObject': <Movie id:5574490[http] title:_"Animal Kingdom" (2016)_>,
  'rank': 53},
 u'Arrow': {'id': '2193021',
  'movieObject': <Movie id:2193021[http] title:_"Arrow" (2012)_>,
  'rank': 25},
 u'Better Call Saul': {'id': '3032476',
  'movieObject': <Movie id:3032476[http] title:_"Better Call Saul" (2015)_>,
  'rank': 29},
 u'Big Little Lies': {'id': '3920596',
  'movieObject': <Movie id:3920596[http] title:_"Big Little Lies" (2017)_>,
  'rank': 76},
 u'Black Mirror': {'id': '2085059',
  'movieObject': <Movie id:2085059[http] title:_"Black Mirror" (2011)_>,
  'rank': 37},
 u'Blood Drive': {'id': '4898282',
  'movieObject': <Movie id:4898282[http] title:_"Blood Drive" (2017)_>,
  'rank': 60},
 u'Bloodline': {'id': '3520702',
  'movieObject': <Movie id:3520702[http] title:_"Bloodline" (2015)_>,
  'rank': 68},
 u'Bones': {'id': '0460627',
  'movieObject': <Movie id:0460627[http] title:_"Bones" (2005)_>,
  'rank': 93},
 u'Breaking Bad': {'id': '0903747',
  'movieObject': <Movie id:0903747[http] title:_"Breaking Bad" (2008)_>,
  'rank': 34},
 u'Brooklyn Nine-Nine': {'id': '2467372',
  'movieObject': <Movie id:2467372[http] title:_"Brooklyn Nine-Nine" (2013)_>,
  'rank': 88},
 u'Claws': {'id': '5640558',
  'movieObject': <Movie id:5640558[http] title:_"Claws" (2017)_>,
  'rank': 96},
 u'Criminal Minds': {'id': '0452046',
  'movieObject': <Movie id:0452046[http] title:_"Criminal Minds" (2005)_>,
  'rank': 50},
 u'Dark Matter': {'id': '4159076',
  'movieObject': <Movie id:4159076[http] title:_"Dark Matter" (2015)_>,
  'rank': 75},
 u'Dexter': {'id': '0773262',
  'movieObject': <Movie id:0773262[http] title:_"Dexter" (2006)_>,
  'rank': 85},
 u'Doctor Who': {'id': '0436992',
  'movieObject': <Movie id:0436992[http] title:_"Doctor Who" (2005)_>,
  'rank': 19},
 u'El Chapo': {'id': '6692188',
  'movieObject': <Movie id:6692188[http] title:_"El Chapo" (2017)_>,
  'rank': 46},
 u'Family Guy': {'id': '0182576',
  'movieObject': <Movie id:0182576[http] title:_"Family Guy" (1999)_>,
  'rank': 81},
 u'Fargo': {'id': '2802850',
  'movieObject': <Movie id:2802850[http] title:_"Fargo" (2014)_>,
  'rank': 11},
 u'Fear the Walking Dead': {'id': '3743822',
  'movieObject': <Movie id:3743822[http] title:_"Fear the Walking Dead" (2015)_>,
  'rank': 26},
 u'Friends': {'id': '0108778',
  'movieObject': <Movie id:0108778[http] title:_"Friends" (1994)_>,
  'rank': 32},
 u'GLOW': {'id': '5770786',
  'movieObject': <Movie id:5770786[http] title:_"GLOW" (2017)_>,
  'rank': 2},
 u'GLOW: Gorgeous Ladies of Wrestling': {'id': '0252462',
  'movieObject': <Movie id:0252462[http] title:_"GLOW: Gorgeous Ladies of Wrestling" (1986)_>,
  'rank': 73},
 u'Game of Thrones': {'id': '0944947',
  'movieObject': <Movie id:0944947[http] title:_"Game of Thrones" (2011)_>,
  'rank': 1},
 u'Genius': {'id': '5673782',
  'movieObject': <Movie id:5673782[http] title:_"Genius" (2017)_>,
  'rank': 59},
 u'Gotham': {'id': '3749900',
  'movieObject': <Movie id:3749900[http] title:_"Gotham" (2014)_>,
  'rank': 49},
 u"Grey's Anatomy": {'id': '0413573',
  'movieObject': <Movie id:0413573[http] title:_"Grey's Anatomy" (2005)_>,
  'rank': 31},
 u'Gypsy': {'id': '5503718',
  'movieObject': <Movie id:5503718[http] title:_"Gypsy" (2017)_>,
  'rank': 64},
 u'Hawaii Five-0': {'id': '1600194',
  'movieObject': <Movie id:1600194[http] title:_"Hawaii Five-0" (2010)_>,
  'rank': 97},
 u'Homeland': {'id': '1796960',
  'movieObject': <Movie id:1796960[http] title:_"Homeland" (2011)_>,
  'rank': 74},
 u'House of Cards': {'id': '1856010',
  'movieObject': <Movie id:1856010[http] title:_"House of Cards" (2013)_>,
  'rank': 12},
 u'How I Met Your Mother': {'id': '0460649',
  'movieObject': <Movie id:0460649[http] title:_"How I Met Your Mother" (2005)_>,
  'rank': 55},
 u'Inhumans': {'id': '4154858',
  'movieObject': <Movie id:4154858[http] title:_"Inhumans" (2017)_>,
  'rank': 15},
 u"It's Always Sunny in Philadelphia": {'id': '0472954',
  'movieObject': <Movie id:0472954[http] title:_"It's Always Sunny in Philadelphia" (2005)_>,
  'rank': 98},
 u'Law & Order: Special Victims Unit': {'id': '0203259',
  'movieObject': <Movie id:0203259[http] title:_"Law & Order: Special Victims Unit" (1999)_>,
  'rank': 79},
 u'Lost': {'id': '0411008',
  'movieObject': <Movie id:0411008[http] title:_"Lost" (2004)_>,
  'rank': 84},
 u'Lucifer': {'id': '4052886',
  'movieObject': <Movie id:4052886[http] title:_"Lucifer" (2015)_>,
  'rank': 71},
 u'Mad Men': {'id': '0804503',
  'movieObject': <Movie id:0804503[http] title:_"Mad Men" (2007)_>,
  'rank': 90},
 u'Modern Family': {'id': '1442437',
  'movieObject': <Movie id:1442437[http] title:_"Modern Family" (2009)_>,
  'rank': 48},
 u'NCIS: Naval Criminal Investigative Service': {'id': '0364845',
  'movieObject': <Movie id:0364845[http] title:_"NCIS: Naval Criminal Investigative Service" (2003)_>,
  'rank': 58},
 u'Narcos': {'id': '2707408',
  'movieObject': <Movie id:2707408[http] title:_"Narcos" (2015)_>,
  'rank': 70},
 u'Once Upon a Time': {'id': '1843230',
  'movieObject': <Movie id:1843230[http] title:_"Once Upon a Time" (2011)_>,
  'rank': 66},
 u'Orange Is the New Black': {'id': '2372162',
  'movieObject': <Movie id:2372162[http] title:_"Orange Is the New Black" (2013)_>,
  'rank': 3},
 u'Orphan Black': {'id': '2234222',
  'movieObject': <Movie id:2234222[http] title:_"Orphan Black" (2013)_>,
  'rank': 72},
 u'Outlander': {'id': '3006802',
  'movieObject': <Movie id:3006802[http] title:_"Outlander" (2014)_>,
  'rank': 45},
 u'Parks and Recreation': {'id': '1266020',
  'movieObject': <Movie id:1266020[http] title:_"Parks and Recreation" (2009)_>,
  'rank': 87},
 u'Peaky Blinders': {'id': '2442560',
  'movieObject': <Movie id:2442560[http] title:_"Peaky Blinders" (2013)_>,
  'rank': 77},
 u'Poldark': {'id': '3636060',
  'movieObject': <Movie id:3636060[http] title:_"Poldark" (2015)_>,
  'rank': 95},
 u'Power': {'id': '3281796',
  'movieObject': <Movie id:3281796[http] title:_"Power" (I) (2014)_>,
  'rank': 8},
 u'Preacher': {'id': '5016504',
  'movieObject': <Movie id:5016504[http] title:_"Preacher" (2016)_>,
  'rank': 7},
 u'Pretty Little Liars': {'id': '1578873',
  'movieObject': <Movie id:1578873[http] title:_"Pretty Little Liars" (2010)_>,
  'rank': 5},
 u'Prison Break': {'id': '0455275',
  'movieObject': <Movie id:0455275[http] title:_"Prison Break" (2005)_>,
  'rank': 35},
 u'Quantico': {'id': '4428122',
  'movieObject': <Movie id:4428122[http] title:_"Quantico" (2015)_>,
  'rank': 92},
 u'Queen of the South': {'id': '1064899',
  'movieObject': <Movie id:1064899[http] title:_"Queen of the South" (2016)_>,
  'rank': 80},
 u'Reign': {'id': '2710394',
  'movieObject': <Movie id:2710394[http] title:_"Reign" (2013)_>,
  'rank': 54},
 u'Rick and Morty': {'id': '2861424',
  'movieObject': <Movie id:2861424[http] title:_"Rick and Morty" (2013)_>,
  'rank': 39},
 u'Riverdale': {'id': '5420376',
  'movieObject': <Movie id:5420376[http] title:_"Riverdale" (2017)_>,
  'rank': 17},
 u'Riviera': {'id': '5936448',
  'movieObject': <Movie id:5936448[http] title:_"Riviera" (2017)_>,
  'rank': 20},
 u'Sense8': {'id': '2431438',
  'movieObject': <Movie id:2431438[http] title:_"Sense8" (2015)_>,
  'rank': 56},
 u'Shadowhunters': {'id': '4145054',
  'movieObject': <Movie id:4145054[http] title:_"Shadowhunters" (2016)_>,
  'rank': 67},
 u'Shameless': {'id': '1586680',
  'movieObject': <Movie id:1586680[http] title:_"Shameless" (2011)_>,
  'rank': 9},
 u'Sherlock': {'id': '1475582',
  'movieObject': <Movie id:1475582[http] title:_"Sherlock" (2010)_>,
  'rank': 57},
 u'Shooter': {'id': '4181172',
  'movieObject': <Movie id:4181172[http] title:_"Shooter" (2016)_>,
  'rank': 86},
 u'Silicon Valley': {'id': '2575988',
  'movieObject': <Movie id:2575988[http] title:_"Silicon Valley" (2014)_>,
  'rank': 21},
 u'Sons of Anarchy': {'id': '1124373',
  'movieObject': <Movie id:1124373[http] title:_"Sons of Anarchy" (2008)_>,
  'rank': 61},
 u'Stranger Things': {'id': '4574334',
  'movieObject': <Movie id:4574334[http] title:_"Stranger Things" (2016)_>,
  'rank': 41},
 u'Suits': {'id': '1632701',
  'movieObject': <Movie id:1632701[http] title:_"Suits" (2011)_>,
  'rank': 33},
 u'Supergirl': {'id': '4016454',
  'movieObject': <Movie id:4016454[http] title:_"Supergirl" (2015)_>,
  'rank': 44},
 u'Supernatural': {'id': '0460681',
  'movieObject': <Movie id:0460681[http] title:_"Supernatural" (2005)_>,
  'rank': 27},
 u'Teen Wolf': {'id': '1567432',
  'movieObject': <Movie id:1567432[http] title:_"Teen Wolf" (2011)_>,
  'rank': 82},
 u"That '70s Show": {'id': '0165598',
  'movieObject': <Movie id:0165598[http] title:_"That '70s Show" (1998)_>,
  'rank': 78},
 u'The 100': {'id': '2661044',
  'movieObject': <Movie id:2661044[http] title:_"The 100" (2014)_>,
  'rank': 23},
 u'The Americans': {'id': '2149175',
  'movieObject': <Movie id:2149175[http] title:_"The Americans" (2013)_>,
  'rank': 100},
 u'The Big Bang Theory': {'id': '0898266',
  'movieObject': <Movie id:0898266[http] title:_"The Big Bang Theory" (2007)_>,
  'rank': 14},
 u'The Blacklist': {'id': '2741602',
  'movieObject': <Movie id:2741602[http] title:_"The Blacklist" (2013)_>,
  'rank': 52},
 u'The Flash': {'id': '3107288',
  'movieObject': <Movie id:3107288[http] title:_"The Flash" (2014)_>,
  'rank': 16},
 u"The Handmaid's Tale": {'id': '5834204',
  'movieObject': <Movie id:5834204[http] title:_"The Handmaid's Tale" (2017)_>,
  'rank': 6},
 u'The Last Kingdom': {'id': '4179452',
  'movieObject': <Movie id:4179452[http] title:_"The Last Kingdom" (2015)_>,
  'rank': 91},
 u'The Leftovers': {'id': '2699128',
  'movieObject': <Movie id:2699128[http] title:_"The Leftovers" (2014)_>,
  'rank': 62},
 u'The Mist': {'id': '5639976',
  'movieObject': <Movie id:5639976[http] title:_"The Mist" (2017)_>,
  'rank': 13},
 u'The Night Shift': {'id': '2477230',
  'movieObject': <Movie id:2477230[http] title:_"The Night Shift" (2014)_>,
  'rank': 83},
 u'The Office': {'id': '0386676',
  'movieObject': <Movie id:0386676[http] title:_"The Office" (2005)_>,
  'rank': 40},
 u'The Originals': {'id': '2632424',
  'movieObject': <Movie id:2632424[http] title:_"The Originals" (2013)_>,
  'rank': 22},
 u'The Ranch': {'id': '4998212',
  'movieObject': <Movie id:4998212[http] title:_"The Ranch" (2016)_>,
  'rank': 28},
 u'The Simpsons': {'id': '0096697',
  'movieObject': <Movie id:0096697[http] title:_"The Simpsons" (1989)_>,
  'rank': 69},
 u'The Vampire Diaries': {'id': '1405406',
  'movieObject': <Movie id:1405406[http] title:_"The Vampire Diaries" (2009)_>,
  'rank': 47},
 u'The Walking Dead': {'id': '1520211',
  'movieObject': <Movie id:1520211[http] title:_"The Walking Dead" (2010)_>,
  'rank': 24},
 u'This Is Us': {'id': '5555260',
  'movieObject': <Movie id:5555260[http] title:_"This Is Us" (2016)_>,
  'rank': 99},
 u'True Detective': {'id': '2356777',
  'movieObject': <Movie id:2356777[http] title:_"True Detective" (2014)_>,
  'rank': 94},
 u'Twin Peaks (1990)': {'id': '0098936',
  'movieObject': <Movie id:0098936[http] title:_"Twin Peaks" (1990)_>,
  'rank': 36},
 u'Twin Peaks (2017)': {'id': '4093826',
  'movieObject': <Movie id:4093826[http] title:_"Twin Peaks" (2017)_>,
  'rank': 10},
 u'Vikings': {'id': '2306299',
  'movieObject': <Movie id:2306299[http] title:_"Vikings" (2013)_>,
  'rank': 30},
 u'Wentworth': {'id': '2433738',
  'movieObject': <Movie id:2433738[http] title:_"Wentworth" (2013)_>,
  'rank': 63},
 u'Westworld': {'id': '0475784',
  'movieObject': <Movie id:0475784[http] title:_"Westworld" (2016)_>,
  'rank': 43},
 u'Younger': {'id': '3288518',
  'movieObject': <Movie id:3288518[http] title:_"Younger" (2015)_>,
  'rank': 65},
 u'Zoo': {'id': '3250026',
  'movieObject': <Movie id:3250026[http] title:_"Zoo" (2015)_>,
  'rank': 89},
 u'iZombie': {'id': '3501584',
  'movieObject': <Movie id:3501584[http] title:_"iZombie" (2015)_>,
  'rank': 38}}

noch weitere Infos über das bereits gespeicherte movieObject von IMDb einholen

In [6]:
#@toggleInputDefault
#@toggleOutputDefault
for series_name,series in tqdm(list(series_data.iteritems())):
    
    if not 'movieObject' in series:
        series['movieObject'] = imdb.get_movie(series['id'])
    
    if not 'episodes_total' in series:
        #mehr Informationen einholen
        m = series['movieObject']
        #versuche Rating zu finden
        try:
            series['rating_series'] = m['rating']
        except KeyError:
            print(m, 'hat kein Rating?, update vote details!')
            imdb.update(m, 'vote details')
            try:
                series['rating_series'] = m['rating']
            except KeyError:
                print(m, 'keine Hoffnung...')
                series['rating_series'] = 0.5
        
        imdb.update(m, 'episodes')
        #speicher die Staffelanzahl, falls vorhanden
        try:
            series['seasons'] = m['number of seasons']
        except KeyError:
            series['seasons'] = 0
        seasons_episodes_structure = {}
        episodes_total_counter = 0
        for season in m['episodes']:
            episodes_season_counter = 0
            for j in m['episodes'][season]:
                episodes_season_counter += 1
            #print counter, "episoden in Staffel ",season
            #speicher Episodenanzahl dieser Staffel
            seasons_episodes_structure[season] = episodes_season_counter
            episodes_total_counter += episodes_season_counter
        #speicher die Episodenanzahl der ganzen Serie
        series['episodes_total'] = episodes_total_counter #evtl. besser m['number of episodes']
        series['seasons_episodes_structure'] = seasons_episodes_structure
        
        
#jetzt Aufbau ist = {'series_name': series{'id', 'rating_series', 'episodes_total', 'seasons', 
#'seasons_episodes_structure': {staffelnummer: episodenanzahl, staffelX-mal}}}
  3%|▎         | 3/100 [00:21<13:05,  8.09s/it]
(<Movie id:5555260[http] title:_"This Is Us" (2016)_>, 'hat kein Rating?, update vote details!')
 20%|██        | 20/100 [02:50<10:05,  7.57s/it]
(<Movie id:5770786[http] title:_"GLOW" (2017)_>, 'hat kein Rating?, update vote details!')
 47%|████▋     | 47/100 [06:24<05:45,  6.52s/it]
(<Movie id:3920596[http] title:_"Big Little Lies" (2017)_>, 'hat kein Rating?, update vote details!')
 52%|█████▏    | 52/100 [07:01<05:22,  6.72s/it]
(<Movie id:5673782[http] title:_"Genius" (2017)_>, 'hat kein Rating?, update vote details!')
 57%|█████▋    | 57/100 [08:01<08:08, 11.37s/it]
(<Movie id:4093826[http] title:_"Twin Peaks" (2017)_>, 'hat kein Rating?, update vote details!')
 63%|██████▎   | 63/100 [08:41<03:34,  5.80s/it]
(<Movie id:4898282[http] title:_"Blood Drive" (2017)_>, 'hat kein Rating?, update vote details!')
 86%|████████▌ | 86/100 [11:31<01:35,  6.83s/it]
(<Movie id:4181172[http] title:_"Shooter" (2016)_>, 'hat kein Rating?, update vote details!')
 87%|████████▋ | 87/100 [11:34<01:13,  5.69s/it]
(<Movie id:0475784[http] title:_"Westworld" (2016)_>, 'hat kein Rating?, update vote details!')
 91%|█████████ | 91/100 [11:56<00:46,  5.12s/it]
(<Movie id:4154858[http] title:_"Inhumans" (2017)_>, 'hat kein Rating?, update vote details!')
(<Movie id:4154858[http] title:_"Inhumans" (2017)_>, 'keine Hoffnung...')
100%|██████████| 100/100 [13:03<00:00,  7.08s/it]

Starting with bokeh, plotting and stuff

In [7]:
#@toggleInputDefault
#@toggleOutputDefault
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.io import output_notebook
output_notebook() #damit figure im notebokk angezeigt wird
from bokeh.models import CustomJS, HoverTool, LinearColorMapper, ColorBar, Renderer, Label, LabelSet, FixedTicker, Range1d, Span, BoxAnnotation
from bokeh.models.glyphs import HBar
from bokeh.charts import Bar
from bokeh.layouts import layout
from bokeh.palettes import viridis, inferno
from bokeh.models.widgets import Toggle

import ipywidgets as widgets
from IPython.display import display
Loading BokehJS ...
E:\Programme\Anaconda3\envs\python2\lib\site-packages\bokeh\util\deprecation.py:34: BokehDeprecationWarning: 
The bokeh.charts API has moved to a separate 'bkcharts' package.

This compatibility shim will remain until Bokeh 1.0 is released.
After that, if you want to use this API you will have to install
the bkcharts package explicitly.

  warn(message)

Methode zum Übergeben einer einzelnen Serie und der Erfassung ihrer Episoden (usw)

In [8]:
#@toggleInputDefault
def get_data_serie_plot(iid):
    print 'Get IMDbs Episodes Information for Serie',iid
    plot_serie = {}
    #finde die Serie und ihre bisherigen Daten in dem Serien-Dict
    for name, serie in list(series_data.iteritems()):
        if serie['id'] == iid:
            plot_serie[name] = serie
            break
    #füge der Serie die Episoden-Daten hinzu
    for name, serie in list(plot_serie.iteritems()):
        serie['episodes'] = {}
        m = serie['movieObject']
        average_episode_rating = 0
        rated_episodes = 0
        for season in tqdm(m['episodes']):
            serie['episodes'][season] = {}
            for episode in m['episodes'][season]:
                serie['episodes'][season][episode] = {}
                e = m['episodes'][season][episode]
                imdb.update(e, 'main')
                #MovieObject der Episode abspeichern
                serie['episodes'][season][episode]['episode_object'] = e
                imdb.update(e, 'vote details')
                #Rating der Episode
                try:
                    serie['episodes'][season][episode]['episode_rating'] = e.get('rating')
                except KeyError:
                    serie['episodes'][season][episode]['episode_rating'] = None #0
                #print season, episode, serie['episodes'][season][episode]['episode_rating']
                #Fälle wo Episoden noch kein Rating (da sie nur angekündigt, aber noch nicht erschienen sind) abfangen
                if serie['episodes'][season][episode]['episode_rating'] != None:
                    rated_episodes += 1
                    average_episode_rating += serie['episodes'][season][episode]['episode_rating']
                #Titel der Episode
                serie['episodes'][season][episode]['episode_title'] = e.get('title')
        #Durchschnitts-Rating aller gerateden Episoden
        if rated_episodes != 0:
            serie['average_episode_rating'] = average_episode_rating/rated_episodes
        else:
            serie['average_episode_rating'] = 0.5
        #Differenz zwischen des Serien- und des Episodendurschnittsratings
        serie['difference_average_episode_to_serie_rating'] = round(serie['average_episode_rating']-serie['rating_series'], 2)
        #Anzahl der gerateden Episoden
        serie['rated_episodes'] = rated_episodes
        
        #füge die Daten auch gleich in das große Serien Dict
        for na, se in list(series_data.iteritems()):
            if se['id'] == iid:
                se['episodes'] = serie['episodes'] 
                
    #gib das Dict, das nur die gewünschte Serien mit ihren Daten erhält zurück
    return plot_serie
print 'definiert'
definiert

Methode für Serien-Episoden-Chart

In [9]:
#@toggleInputDefault
#Methode für Serien-Episode-Plot
def serie_episodes_figure(requestedSerie_id):
    
    #gewünschte Serie erhalten
    serie_detailed = get_data_serie_plot(requestedSerie_id)
    
    #Struktur der Daten für Chart definieren
    source = ColumnDataSource(data=dict(
        episodes_ratings = [],
        episodes_titles = [],
        episodes_numbers = [],
        episodes_seasonic = [],
        season = [],
        season_episode_numeration = []
    ))
    
    #passende/benötigte Daten aus Serien-Dict an Source für Chart übergeben
    for name, serie in list(serie_detailed.iteritems()):
        average_episode_rating = 0
        episode_counter = 0
        min_rating = 10
        max_rating = 0
        serie_title = name
        serie_rating = serie['rating_series']
        serie_structure = serie['seasons_episodes_structure']
        season_tracker = 0
        episodes_in_seasonic = []
        in_season_episode_numeration = []
        for season in serie['episodes']:
            season_tracker = season
            for episodes in serie['episodes'][season]:
                ep_rat = serie['episodes'][season][episodes]['episode_rating']
                source.data['episodes_ratings'].append(ep_rat)
                #SEpisoden ohne Rating keinen Einfluss auf min und max für Layout natürlich
                if ep_rat != None:
                    if ep_rat < min_rating:
                        min_rating = ep_rat
                    if ep_rat > max_rating:
                        max_rating = ep_rat
                source.data['episodes_titles'].append(serie['episodes'][season][episodes]['episode_title'])
                episode_counter +=1
                source.data['episodes_numbers'].append(episode_counter)
                source.data['episodes_seasonic'].append(episodes)
                episodes_in_seasonic.append(str(episodes))
                source.data['season'].append(season_tracker)
                source.data['season_episode_numeration'].append(str(season)+'.'+str(episodes))
                in_season_episode_numeration.append(str(season)+'.'+str(episodes))
        tot_episodes = serie['episodes_total']       
        average_episode_rating = round(serie['average_episode_rating'], 2)

    #definieren der Ranges des Plots mit Grenzen, damit man nicht zuweit oder zunah zoomt und scrollt
    if episode_counter <= 100:
        x_start = episode_counter
    else:
        x_start = 100
    x_range = Range1d(0, x_start, bounds = (0, episode_counter+2), min_interval = 10, max_interval = episode_counter+2)
    y_range = Range1d(min_rating-1, max_rating+1, bounds = (0, 11), min_interval = 1, max_interval = 10)

    p = figure(x_range = x_range, y_range = y_range, 
               tools = ['hover', 'tap', 'xwheel_pan', 'xwheel_zoom', 'wheel_zoom', 'pan', 'resize', 'reset'], 
               active_scroll = 'xwheel_zoom', title = serie_title, x_axis_location = 'below', plot_width = 900)
    p.title.text_font_size = '18pt'
    hover = p.select_one(HoverTool)

    #Definieren der Tooltips beim Hovern
    hover.tooltips = [('Episode Title', '@episodes_titles'), ('Episode Rating', '@episodes_ratings{1.1}'), 
                      ('Season.Episode', '@season'+'.'+'@episodes_seasonic')]

    #Line- und Kreis-Chart aus den Daten generieren
    p.line('episodes_numbers', 'episodes_ratings', line_width = 2, source = source)
    p.circle('episodes_numbers', 'episodes_ratings', size = 7, source = source)

    #Grids anpassen
    p.ygrid.band_fill_alpha = 0.1
    p.ygrid.band_fill_color = 'green'
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    p.ygrid.ticker = FixedTicker(ticks = range(0, 11, 1))

    #Achsen benennen und deren Nummerierung anpassen
    p.xaxis.axis_label = 'Episodes'
    p.yaxis.axis_label = 'IMDb Rating'
    p.yaxis.ticker = FixedTicker(ticks = range(0, 11, 1))
    p.yaxis.major_tick_line_color = None
    p.xaxis.axis_line_color = None
    p.yaxis.axis_line_color = None

    #unötige/störende Outline weg
    p.outline_line_color = None

    #Annotations für die Staffel
    helper = 1
    for season, episodes in list(serie_structure.iteritems()):
        left = helper
        right = left + episodes -1
        helper = right + 1
        box = BoxAnnotation(left = left, right = right, fill_alpha = 0.1, fill_color = 'red', line_color = None)
        p.add_layout(box)
        label = Label(x = left+((right-left)/2), y = min_rating-0.3, text = 'Season '+str(season), 
                  text_baseline = 'hanging', text_align = 'center')
        p.add_layout(label)

    #Annotations für Episode-Average-Rating- und Serien-Rating-Linie
    ep_av_ra_line = Span(location = average_episode_rating, dimension = 'width', visible = False,
                         line_color = 'green', line_dash = 'dashed', line_width = 2)
    p.add_layout(ep_av_ra_line)

    se_ra_line = Span(location = serie_rating, dimension = 'width', visible = False,
                      line_color = 'red', line_dash='dashed', line_width = 2)
    p.add_layout(se_ra_line)

    #simple Toggle-Knöpfe zum Steuern der Anzeigen der Episode-Average-Rating- und der Serien-Rating-Linie
    code = '''object.visible = toggle.active'''

    callback1 = CustomJS.from_coffeescript(code = code, args = {})
    toggle_ep_av_ra = Toggle(label = 'Average Episode Rating ('+str(average_episode_rating)+')', button_type = 'success', 
                             callback = callback1, active = False)
    callback1.args = {'toggle': toggle_ep_av_ra, 'object': ep_av_ra_line}

    callback2 = CustomJS.from_coffeescript(code = code, args = {})
    toggle_se_ra = Toggle(label = ('Serie Rating ('+str(serie_rating)+')'), button_type = 'success', 
                          callback = callback2, active = False)
    callback2.args = {'toggle': toggle_se_ra, 'object': se_ra_line}

    print 'requested figure completed'
    return layout([p], [toggle_ep_av_ra, toggle_se_ra])
print 'definiert'
definiert

Methode für Vergleich-Charts der Rating-Differenzen

In [10]:
#@toggleInputDefault
#Methode für 2 Charts zum Vergleich der Rating-Differenzen
def rating_difference_comparison_figure():
    """Build two charts comparing series ratings with episode ratings.

    Only entries of the module-level ``series_data`` dict that already carry
    a 'difference_average_episode_to_serie_rating' value are included.

    Returns:
        A bokeh layout with the difference chart (one segment plus circle
        per series) in the first row and the grouped bar chart in the second.
    """
    diff_key = 'difference_average_episode_to_serie_rating'

    # Two-colour mapper over -10..10, shared by segments and circles.
    diff_mapper = LinearColorMapper(low = -10, high = 10, palette = ['red', 'green'])

    def diff_color():
        # Fresh colour spec per use, driven by the 'rating_diff' column.
        return {'field': 'rating_diff', 'transform': diff_mapper}

    # Series that already have the difference value computed.
    rated = [(name, serie) for name, serie in series_data.items() if diff_key in serie]

    # Column data for the difference chart (also feeds the bar chart labels).
    source = ColumnDataSource(data = dict(
        rank = [serie['rank'] for _, serie in rated],
        rating = [serie['rating_series'] for _, serie in rated],
        seasons = [serie['seasons'] for _, serie in rated],
        episodes = [serie['episodes_total'] for _, serie in rated],
        title = [name for name, _ in rated],
        iid = [serie['id'] for _, serie in rated],
        rating_diff = [serie[diff_key] for _, serie in rated],
        episodes_rating = [serie['average_episode_rating'] for _, serie in rated]
    ))
    titles = [name for name, _ in rated]

    # Long-format rows for the grouped bar chart: three bars per series.
    bar_dict = {'title': [], 'ratings': [], 'groups': []}
    for name, serie in rated:
        bar_dict['title'].extend([name] * 3)
        bar_dict['ratings'].extend([serie['rating_series'],
                                    serie['average_episode_rating'],
                                    serie[diff_key]])
        bar_dict['groups'].extend(['Serie Rating',
                                   'Episodes Rating',
                                   'Rating Difference (Episodes-Serie)'])

    ### Bar chart with bars for series rating, episodes rating and difference
    bar_chart = Bar(bar_dict, values = 'ratings', label = 'title', group = 'groups',
                    title = 'Rating Difference (Bar-Chart)', xlabel = 'Serie Title', ylabel = 'Rating',
                    tools = ['hover', 'xwheel_pan', 'reset'], active_scroll = 'xwheel_pan',
                    plot_width = 900)
    bar_chart.select_one(HoverTool).tooltips = [('Title', '@title'), ('Rating', '@height{1.11}'),
                                                ('What', '@groups')]

    # TODO: check whether these labels make sense on the bar chart.
    # Series titles are drawn inside the chart, so the axis tick labels are hidden.
    bar_titles = LabelSet(x = 'title', y = 0, text = 'title', y_offset = -25,
                          text_font_size = "16pt", text_color = 'grey', text_baseline = 'hanging',
                          source = source, text_align = 'center')
    bar_chart.add_layout(bar_titles)
    bar_chart.xaxis.major_label_text_font_size = '0pt'

    # Remove the distracting outline, axis lines and x ticks.
    bar_chart.outline_line_color = None
    bar_chart.xaxis.axis_line_color = None
    bar_chart.yaxis.axis_line_color = None
    bar_chart.xaxis.major_tick_line_color = None

    ### Figure with one line and circle per series for the rating difference
    diff_plot = figure(y_range = titles, x_range = (-3,3),
                       tools = ['hover', 'tap', 'pan', 'ywheel_pan', 'xwheel_zoom', 'reset'],
                       active_scroll = 'ywheel_pan',
                       title = 'Difference between episodes-average- and series-rating',
                       x_axis_location = 'above', plot_width = 900)
    diff_plot.title.text_font_size = '18pt'

    diff_plot.segment(0, 'title', 'rating_diff', 'title', line_width = 3,
                      line_color = diff_color(), source = source)
    diff_plot.circle('rating_diff', 'title', size = 25, fill_color = diff_color(),
                     line_color = diff_color(), line_width=3, source = source)

    # Tooltips shown while hovering.
    diff_plot.select_one(HoverTool).tooltips = [
        ('Title', '@title'), ('Serie Rating', '@rating{1.11}'),
        ('Episodes Rating', '@episodes_rating{1.11}'), ('Rating Difference', '@rating_diff{1.11}')]

    # Series titles are drawn as labels in the chart, so the axis tick labels are hidden.
    diff_titles = LabelSet(x = 0, y = 'title', text = 'title', text_align = 'center', text_color = 'black',
                           level = 'glyph', y_offset = 20, source = source)
    diff_plot.add_layout(diff_titles)
    diff_plot.yaxis.major_label_text_font_size = '0pt'

    # Grid styling
    diff_plot.xgrid.band_fill_alpha = 0.2
    diff_plot.xgrid.band_fill_color = 'black'
    diff_plot.ygrid.grid_line_color = None
    diff_plot.xgrid.grid_line_color = 'grey'
    diff_plot.xgrid.ticker = FixedTicker(ticks = [-10, 0, 10])

    # Axis titles and tick styling
    diff_plot.xaxis.axis_label = 'Rating Difference (Episodes - Serie)'
    diff_plot.yaxis.axis_label = 'Serie Title'
    diff_plot.yaxis.major_tick_line_color = None
    diff_plot.xaxis.axis_line_color = None
    diff_plot.yaxis.axis_line_color = None

    # Remove the distracting outline.
    diff_plot.outline_line_color = None

    # Difference chart on top, bar chart below.
    combined = layout([[diff_plot], [bar_chart]])
    print('requested figure completed')
    return combined
print('definiert')
definiert

Serien-Rank-Chart und UI

In [12]:
#@toggleInputDefault
#@toggleOutput
# Rank chart: one horizontal bar per series, positioned by popularity rank and
# sized/coloured by rating. The y range runs from 10 down to 0, so low rank
# numbers sit at the top; the 'ywheel_pan'/'ypan' tools are there to move
# along the rank axis.
plot = figure(x_range = (0, 10), y_range = (10, 0),tools = ['hover', 'tap', 'ywheel_pan', 'ypan'], active_scroll = 'ywheel_pan', 
           title = 'Popular Series on IMDb', x_axis_location = 'above', plot_width = 900)
plot.title.text_font_size = '18pt'
# Maps ratings 0.0..10.0 onto a 256-step viridis palette (used for the bars and the colour bar).
color_mapper = LinearColorMapper(low = 0.0, high = 10.0, palette = viridis(256))
hover = plot.select_one(HoverTool)

# Define the structure of the chart data
source = ColumnDataSource(data=dict(
    rank = [],
    rating = [],
    seasons = [],
    episodes = [],
    title = [],
))
# Copy the required values from the series dict into the chart source
for name, serie in list(series_data.iteritems()):
    source.data['rank'].append(serie['rank'])
    source.data['rating'].append(serie['rating_series'])
    source.data['seasons'].append(serie['seasons'])
    source.data['episodes'].append(serie['episodes_total'])
    source.data['title'].append(name)

# Generate the horizontal bar chart from the data
plot.hbar(y = 'rank', height = 0.9, left = 0, right = 'rating', fill_color = {'field': 'rating', 'transform': color_mapper}, 
       line_color = None, source = source)

# Tooltips shown while hovering
hover.tooltips = [("Title", "@title"), ("Number of seasons", "@seasons"), ("Number of episodes", "@episodes"),
                  ("Rating", "@rating{1.1}")]

# Define the selection behaviour
# (could also be defined directly in the hbar() call)
# possibly pointless and should be removed?
# NOTE(review): these two assignments target the name `Renderer` itself, not
# the renderer returned by plot.hbar() above (whose return value is discarded).
# If `Renderer` is the bokeh class, this likely has no effect on the bars - confirm.
selected_bar = HBar(fill_alpha = 1)
nonselected_bar = HBar(fill_alpha = 0)
Renderer.selection_glyph = selected_bar
Renderer.nonselection_glyph = nonselected_bar

# Define and add labels with the series titles (drawn inside the bars, starting at x = 0.1)
labels = LabelSet(x = 0.1, y = 'rank', text = 'title',
                  text_font_size = "16pt", text_color = 'black',
                  source = source, text_align = 'left', text_baseline = 'middle')
plot.add_layout(labels)

# Grid styling
plot.xgrid.band_fill_alpha = 0.1
plot.xgrid.band_fill_color = 'green'
plot.xgrid.grid_line_color = None
plot.ygrid.grid_line_color = None
plot.ygrid.ticker = FixedTicker(ticks = range(0, 11, 1))

# Name the axes and adjust their tick numbering
plot.xaxis.axis_label = 'IMDb Rating'
plot.yaxis.axis_label = 'IMDb Popularity Rank'
plot.xaxis.ticker = FixedTicker(ticks = range(0, 11, 1))
# One tick per rank from 1 to 100
plot.yaxis.ticker = FixedTicker(ticks = range(1, 101, 1))
plot.xaxis.major_tick_line_color = None
plot.yaxis.major_tick_line_color = None
plot.xaxis.axis_line_color = None
plot.yaxis.axis_line_color = None

# Remove the unnecessary/distracting outline
plot.outline_line_color = None

# Add a colour bar explaining the bar colours
color_bar = ColorBar(color_mapper = color_mapper, #ticker = ContinuousTicker(),
                     label_standoff = 12, border_line_color = None, location = (0,0), orientation = 'horizontal')

plot.add_layout(color_bar, 'below')

# Show the chart
show(plot)

###
#Widgets UI und Funktionen zur Generierung zusätzlicher Serie-/Episoden-Plots durch den Nutzer
###

def btn_clk_episodes(widget):
    """Click handler: show the episodes chart for the selected series.

    Reads the current value of the module-level ``dropdownWidget`` (set to
    the series id in the dropdown options) and shows the figure returned by
    ``serie_episodes_figure`` for it. ``widget`` is the clicked button and
    is not used.
    """
    selected_serie_id = dropdownWidget.value
    episodes_figure = serie_episodes_figure(selected_serie_id)
    show(episodes_figure)

def btn_clk_comparison(widget):
    """Click handler: build and show the rating-difference comparison charts.

    ``widget`` is the clicked button and is not used.
    """
    comparison_layout = rating_difference_comparison_figure()
    show(comparison_layout)

# Build a simple series dropdown list from the series dict:
# label "<rank> : <title>" -> series id, in ascending rank order.
dropdown = OrderedDict()
# 0 acts as the "not set yet" marker; it is replaced by the id of the first
# series in rank order and used as the dropdown's initial value.
first_value = 0
#print 'Generating DropdownWidget (rankordered)'
for name, serie in sorted(series_data.items(), key=lambda x: x[1]['rank']):
    dropdown[str(serie['rank'])+' : '+name] = serie['id']
    if first_value == 0:
        first_value = serie['id']

dropdownWidget = widgets.Dropdown(options = dropdown, value = first_value, description = 'Select Serie:', 
                                  layout = widgets.Layout(flex = '1 1 auto'), width = 'auto')

# Button that shows the series/episode chart for the dropdown selection
buttonWidget = widgets.Button(description = 'update Serie/Episode Chart',
                             layout = widgets.Layout(flex = '1 1 auto'), width = 'auto')
buttonWidget.on_click(btn_clk_episodes)

# Button that shows the rating-difference comparison charts
buttonWidgetDiffComp = widgets.Button(description = 'Comparison chart: Rating Difference (of updated series)',
                             layout = widgets.Layout(flex = '1 1 auto'), width = 'auto')
buttonWidgetDiffComp.on_click(btn_clk_comparison)

# Lay out the widgets: dropdown and update button side by side in a row ...
box_layout = widgets.Layout(display = 'flex', flex_flow = 'row', align_items = 'stretch', width = '75%', 
                            justify_content = 'center')
box = widgets.Box(children = [dropdownWidget, buttonWidget], layout = box_layout)

# ... and that row stacked above the comparison button in a column
box_layout2 = widgets.Layout(display='flex', flex_flow='column', align_items='stretch', width='75%')

box2 = widgets.Box(children = [box, buttonWidgetDiffComp], layout = box_layout2)

display(box2)
#display(dropdownWidget, buttonWidget)
Get IMDbs Episodes Information for Serie 0944947
100%|██████████| 8/8 [00:00<00:00, 8000.58it/s]
requested figure completed
Get IMDbs Episodes Information for Serie 5016504
100%|██████████| 2/2 [00:57<00:00, 28.53s/it]
requested figure completed
Get IMDbs Episodes Information for Serie 2372162
100%|██████████| 7/7 [00:00<00:00, 3499.84it/s]
requested figure completed
Get IMDbs Episodes Information for Serie 0898266
100%|██████████| 12/12 [00:00<00:00, 2400.06it/s]
requested figure completed
requested figure completed

Ende