#!/usr/bin/python

"""
yle-dl - Frontend for rtmpdump-yle, the YLE Areena stream downloader

Copyright (C) 2010 Antti Ajanki <antti.ajanki@iki.fi>

This script extracts stream information from a YLE Areena web page and
calls rtmpdump-yle with correct parameters.
"""

import htmlentitydefs
import os
import re
import signal
import subprocess
import sys
import urllib
import urllib2
import urlparse

# Path of the rtmpdump-yle executable that performs the actual download.
RTMPDUMPYLE_BINARY = '/usr/bin/rtmpdump-yle'
# Default rtmpdump-yle options for YLE Areena streams.
RTMPDUMPYLE_OPTIONS = ['-r', 'rtmp://flashk.yle.fi/AreenaServer',
                       '-s', 'http://areena.yle.fi/player/Application.swf?build=2',
                       '-m', '60']
# Default rtmpdump-yle options for Elava Arkisto streams.
RTMPDUMPYLE_OPTIONS_ARKISTO = \
    ['-r', 'rtmp://flashk.yle.fi/ElavaArkisto',
     '-s', 'http://www.yle.fi/elavaarkisto/flowplayer/flowplayer.commercial-3.2.2-dev2.swf',
     '-m', '60']
# Browser-like request headers sent when fetching Areena pages.
# Accept-Charset fixed: it previously read 'ISO-88591-1', a typo for
# the ISO-8859-1 charset name.
HTTP_HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'fi,en-us;q=0.5',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
                'Connection': 'close',
                'Referer': 'http://areena.yle.fi/'}

# list of all options that require an argument
ARGOPTS = ('--rtmp', '-r', '--host', '-n', '--port', '-c', '--socks',
           '-S', '--swfUrl', '-s', '--tcUrl', '-t', '--pageUrl', '-p',
           '--app', '-a', '--swfhash', '-w', '--swfsize', '-x', '--swfVfy',
           '-W', '--swfAge', '-X', '--auth', '-u', '--conn', '-C',
           '--flashVer', '-f', '--subscribe', '-d', '--flv', '-o',
           '--timeout', '-m', '--start', '-A', '--stop', '-B', '--token',
           '-T', '--skip', '-k', '--areenaParams', '--ylePassi', '--destdir')

# Global verbosity flag, set from --verbose/--debug in main().
debug = False

def usage():
    """Print the usage message to stderr"""
    helptext = ["Usage:",
                "%s [yle-dl or rtmpdump options] URL" % sys.argv[0],
                "",
                "yle-dl options:",
                "",
                "--episodes              Download all episodes from the given YLE Areena page",
                "--latestepisode         Download the latest episode",
                "",
                "rtmpdump options:",
                ""]
    for line in helptext:
        print >> sys.stderr, line
    # Let rtmpdump-yle print its own option summary.
    subprocess.call([RTMPDUMPYLE_BINARY, '--help'])

def download_page(url):
    """Returns contents of a HTML page at url."""
    if url.find('://') == -1:
        url = 'http://' + url

    request = urllib2.Request(url, headers=HTTP_HEADERS)
    try:
        urlreader = urllib2.urlopen(request)
        charset = urlreader.info().getparam('charset')
        if charset is None:
            charset = 'iso-8859-1'

        return unicode(urlreader.read(), charset, 'replace')
    except urllib2.URLError, exc:
        print >> sys.stderr, "Can't read %s: %s" % (url, str(exc.reason))
        return None
    except ValueError:
        print >> sys.stderr, 'Invalid URL: ' + url
        return None

def encode_url_utf8(url):
    """Encode the path component of url to percent-encoded UTF8."""
    (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)

    path = path.encode('UTF8')

    # If the path already contains percent-encoded entities, quoting a
    # second time would corrupt it -- assume it's already encoded.
    already_encoded = re.search(r'%[0-9A-Fa-f]{2}', path) is not None
    if not already_encoded:
        path = urllib.quote(path, '/+')

    return urlparse.urlunparse((scheme, netloc, path, params, query, fragment))

def replace_entitydefs(content):
    """Replace HTML entity references (e.g. &amp;) in content with the
    corresponding characters, decoded as ISO-8859-1.

    Requires the htmlentitydefs module, which was referenced here but
    never imported (fixed in the file's import block).  An unknown
    entity name still raises KeyError, as before.
    """
    def decode_entity(match):
        return unicode(htmlentitydefs.entitydefs[match.group(1)], 'iso-8859-1')

    return re.sub(r'&(.*?);', decode_entity, content)

def sane_filename(name):
    """Return name converted to a safe filename: surrounding whitespace
    stripped, path separators replaced with underscores and leading
    dots removed (so the result is never hidden or a relative path)."""
    x = name.strip().replace('/', '_')
    # lstrip instead of a while-loop over x[0]: the old code raised
    # IndexError on an empty or all-dots name.
    return x.lstrip('.')

def execute_rtmpdump(args):
    if debug:
        print >> sys.stderr, 'Executing:'
        print >> sys.stderr, ' '.join(args)

    try:
        rtmpdump_process = subprocess.Popen(args)
        return rtmpdump_process.wait()
    except KeyboardInterrupt:
        os.kill(rtmpdump_process.pid, signal.SIGINT)
        rtmpdump_process.wait()
        return 2
    except OSError, exc:
        print >> sys.stderr, "Execution failed:", exc
        return 2


### Areena ###


def download_single_episode(url, parameters):
    """Extracts Areena player params from a web page at url and starts
    a rtmpdump-yle process with additional parameters."""
    html = download_page(url)
    if html is None:
        return 1

    areenaparams = extract_areena_params(html)
    if areenaparams is None:
        print >> sys.stderr, "Can't find AreenaPlayerParams from the page."
        print >> sys.stderr, "Is %s really a YLE Areena video page?" % url
        return 1

    # Assemble the full rtmpdump-yle command line in one expression.
    cmdline = ([RTMPDUMPYLE_BINARY]
               + RTMPDUMPYLE_OPTIONS
               + ['--pageUrl', url]
               + parameters
               + ['--areenaParams', areenaparams])

    return execute_rtmpdump(cmdline)

def download_episodes(url, parameters, latest_only):
    """Extract all episodes (or just the latest episode if latest_only
    is True) from url and download them one after another."""
    pages = get_episode_pages(url)
    if pages is None:
        return 1

    if latest_only:
        pages = pages[:1]

    if len(pages) > 1:
        print >> sys.stderr, 'Downloading %d episodes' % len(pages)

    for page in pages:
        print >> sys.stderr, 'Downloading episode at ' + page
        # --noOverwrite keeps already-downloaded episodes intact.
        result = download_single_episode(page, parameters + ['--noOverwrite'])
        if result != 0:
            return result

    return 0
    
def extract_areena_params(html):
    """Return the text inside the AreenaPlayerParams div of html, or
    None when the page contains no such div."""
    match = re.search(r'<div class="AreenaPlayerParams">([^<]*)</div>', html)
    return match.group(1) if match is not None else None

def get_episode_pages(url):
    """Return a list of episode page URLs found via the RSS feed linked
    from the Areena page at url.

    Returns None when the page itself can't be loaded (fatal), and an
    empty list when no RSS link, no feed or no episodes are found.
    """
    if debug:
        print >> sys.stderr, 'Searching for episodes in %s' % url

    html = download_page(url)
    if html is None:
        return None

    m = re.search(r'<a href="(.*?)">Tilaa (?:RSS|uusimmat)</a>', html)
    if m is None:
        print >> sys.stderr, "No RSS link in %s" % url
        return []

    rssurl = urlparse.urljoin(url, m.group(1))

    if debug:
        print >> sys.stderr, 'Getting episode RSS %s' % rssurl

    rss = download_page(rssurl)
    # Bug fix: download_page can return None on a network error; the
    # old code crashed on rss.find() in that case.
    if rss is None:
        return []

    i = rss.find('<item>')
    if i == -1:
        return []

    # Skip the channel header before the first <item> so the channel's
    # own <link> isn't picked up.  re.findall always returns a list
    # (possibly empty), so no None check is needed.
    return re.findall(r'<link>(.*?)</link>', rss[i:])


### Elava Arkisto ###


def extract_arkisto_playlist(html):
    """Parse an Elava Arkisto page and return a list of (playpath,
    filename) pairs, one per clip found on the page."""
    # The page's main <h1> becomes a common title prefix for all clips.
    maintitle = ''
    content_pos = html.find(' id="content-main"')
    if content_pos != -1:
        titlematch = re.search(r'<h1>([^<>]*)', html[content_pos:])
        if titlematch is not None:
            maintitle = titlematch.group(1).strip() + ' - '

    listmatch = re.search(r'<ul class="clips">.*?</ul>', html, re.DOTALL)
    if listmatch is None:
        return []

    items = re.findall(r'<div class="clipdetails">.*?<a .*?id="(.*?)".*?>(.*?)</a>',
                       listmatch.group(), re.DOTALL)

    clips = []
    for mediaID, subtitle in items:
        fulltitle = sane_filename(replace_entitydefs(maintitle + subtitle))
        # The id prefix distinguishes video clips from audio clips.
        if mediaID.startswith('mp4'):
            clips.append((mediaID + '_hi.mp4', fulltitle + '.mp4'))
        elif mediaID.startswith('mp3'):
            clips.append((mediaID, fulltitle + '.mp3'))

    return clips
    
def download_single_arkisto_episode(playpath, filename, parameters, pageurl):
    """Start rtmpdump-yle for a single Elava Arkisto clip and return
    its exit status."""
    fsencoding = sys.getfilesystemencoding()
    cmdline = [RTMPDUMPYLE_BINARY]
    cmdline.extend(RTMPDUMPYLE_OPTIONS_ARKISTO)
    cmdline.extend(['-y', playpath.encode(fsencoding),
                    '-p', pageurl.encode(fsencoding),
                    '-o', filename.encode(fsencoding, 'replace')])
    cmdline.extend(parameters)

    return execute_rtmpdump(cmdline)

def download_arkisto_episodes(url, parameters, latest_episode):
    """Download every clip (or only the first one when latest_episode
    is True) from the Elava Arkisto page at url.

    Returns 0 on success, or the first non-zero rtmpdump-yle exit
    status / 1 on error.
    """
    html = download_page(url)
    if html is None:
        return 1

    playlist = extract_arkisto_playlist(html)
    if len(playlist) == 0:
        print >> sys.stderr, "Can't find streams at %s." % url
        return 1

    if latest_episode:
        # Bug fix: keep a one-element list.  The old 'playlist =
        # playlist[0]' made the loop below iterate over the two strings
        # of a single (playpath, filename) tuple.
        playlist = playlist[:1]

    for clip in playlist:
        status = download_single_arkisto_episode(clip[0], clip[1], parameters, url)
        if status != 0:
            return status

    return 0


### main program ###

    
def main():
    """Parse the command line and dispatch to the right downloader:
    Elava Arkisto, Areena episode list or a single Areena episode.
    Exits the process with the downloader's status code."""
    global debug
    episodes = False
    latest_episode = False
    argv = sys.argv[1:]

    # Find the URL: the last non-option argument that is not the value
    # of an option listed in ARGOPTS (options that take an argument).
    url = None
    prevarg = ''
    for arg in argv:
        if not arg.startswith('-') and prevarg not in ARGOPTS:
            url = arg
        elif arg in ['--verbose', '-V', '--debug', '-z']:
            debug = True
        elif arg in ['--episodes']:
            episodes = True
        elif arg in ['--latestepisode']:
            latest_episode = True
        prevarg = arg
    # Strip the yle-dl-only options; the remaining argv is passed
    # through to rtmpdump-yle untouched.
    if episodes:
        argv.remove('--episodes')
    if latest_episode:
        argv.remove('--latestepisode')

    if url is None or '--help' in argv:
        usage()
        sys.exit(1)

    argv.remove(url)

    # Is sys.getfilesystemencoding() the correct encoding for
    # sys.argv?
    encoding = sys.getfilesystemencoding()
    try:
        url = unicode(url, encoding)
    except UnicodeDecodeError:
        print >> sys.stderr, 'Warning: Failed to encode URL!'
        url = unicode(url, 'ascii', 'replace')

    url = encode_url_utf8(url)
    # Elava Arkisto pages are recognized by URL prefix; anything else
    # is assumed to be a YLE Areena page.
    if url.startswith('http://www.yle.fi/elavaarkisto/') or \
            url.startswith('http://yle.fi/elavaarkisto/'):
        sys.exit(download_arkisto_episodes(url, argv, latest_episode))
    elif latest_episode:
        sys.exit(download_episodes(url, argv, True))
    elif episodes:
        sys.exit(download_episodes(url, argv, False))
    else:
        sys.exit(download_single_episode(url, argv))


# Run the downloader only when executed as a script, not on import.
if __name__ == '__main__':
    main()
