#!/usr/bin/env python

# $LastChangedDate: 2008-12-30 12:21:51 +0100 (Tue, 30 Dec 2008) $
# $Rev: 92 $
# $Author: pauldebruin $

"""
SYNOPSIS

tv_grab_nl_py is a python script that trawls tvgids.nl for TV
programming information and outputs it in XMLTV-formatted output (see
http://membled.com/work/apps/xmltv). Users of MythTV
(http://www.mythtv.org) will appreciate the output generated by this
grabber, because it fills the category fields, i.e. colors in the EPG,
and has logos for most channels automagically available. Check the
website below for screenshots.  The newest version of this script can be
found here: 

     http://code.google.com/p/tvgrabnlpy/

USAGE

Check the web site above and/or run script with --help and start from there

HISTORY

tv_grab_nl_py used to be called tv_grab_nl_pdb, first released on
2003/07/09. The name change was necessary because more and more people
are actively contributing to this script and I always disliked using my
initials (I was just too lazy to change it). At the same time I switched
from using CVS to SVN and as a result the version numbering scheme has
changed. The latest official release of tv_grab_nl_pdb is 0.48. The
first official release of tv_grab_nl_py is 6.

QUESTIONS

Questions (and patches) are welcome at: paul at pwdebruin dot net.

IMPORTANT NOTES

If you were using tv_grab_nl from the XMLTV bundle then enable the
compat flag or use the --compat command-line option.  Otherwise, the
xmltvid's are wrong and you will not see any new data in MythTV.

CONTRIBUTORS

Main author: Paul de Bruin (paul at pwdebruin dot net)

Michel van der Laan made available his extensive collection of
high-quality logos that is used by this script. 

Michael Heus has taken the effort to further enhance this script so that
it now also includes:
 - Credit info: directors, actors, presenters and writers
 - removal of programs that are actually just groupings/broadcasters 
   (e.g. "KETNET", "Wild Friday", "Z@pp")
 - Star-rating for programs tipped by tvgids.nl
 - Black&White, Stereo and URL info
 - Better detection of Movies
 - and much, much more... 

Several other people have provided feedback and patches (these are the
people I could find in my email archive, if you are missing from this
list let me know):
Huub Bouma, Roy van der Kuil, Remco Rotteveel, Mark Wormgoor, Dennis van
Onselen, Hugo van der Kooij, Han Holl, Ian Mcdonald, Udo van den Heuvel.

""" 

# Modules we need
import re, urllib2, getopt, sys
import time, random
import htmlentitydefs, os, os.path, pickle
from string import replace, split, strip
from threading import Thread
from xml.sax import saxutils

# Extra check for the datetime module 
try:
    import datetime
except:
    sys.stderr.write('This script needs the datetime module that was introduced in Python version 2.3.\n')
    sys.stderr.write('You are running:\n')
    sys.stderr.write('%s\n' % sys.version)
    sys.exit(1)


# do extra debug stuff
debug = 1

try:
    import redirect
except:
    debug = 0
    pass

# globals
# compile only one time; matches HTML entities: hex numeric (&#xA0;),
# decimal numeric (&#160;) and named (&nbsp;) forms
r_entity = re.compile(r'&(#x[0-9A-Fa-f]+|#[0-9]+|[A-Za-z]+);')

# base URL of the data source and its advanced-search page
tvgids = 'http://www.tvgids.nl/'
uitgebreid_zoeken = tvgids + 'zoeken/'

# how many seconds to wait before we timeout on a 
# url fetch, 10 seconds seems reasonable
global_timeout = 10

# Wait a random number of seconds between each page fetch.
# We want to be nice and not hammer tvgids.nl (these are the 
# friendly people that provide our data...).
# Also, it appears tvgids.nl throttles its output.
# So there, there is no point in lowering these numbers, if you 
# are in a hurry, use the (default) fast mode.
nice_time = [1, 2]

# Maximum length in minutes of gaps/overlaps between programs to correct
max_overlap = 10

# Strategy to use for correcting overlapping programming:
# 'average' = use average of stop and start of next program
# 'stop'    = keep stop time of current program and adjust start time of next program accordingly
# 'start'   = keep start time of next program and adjust stop of current program accordingly
# 'none'    = do not use any strategy and see what happens
overlap_strategy = 'average'

# Experimental strategy for clumping overlapping programming, all programs that overlap more
# than max_overlap minutes, but less than the length of the shortest program are clumped 
# together. Highly experimental and disabled for now.
do_clump = False

# Create a category translation dictionary
# Look in mythtv/themes/blue/ui.xml for all category names
# The keys are the categories used by tvgids.nl (lowercase please)
# NOTE: 'muziek' used to appear twice ('Art/Music' and 'Music'); the
# duplicate has been removed, keeping 'Music' which was the effective
# value (in a dict literal the last occurrence of a key wins).
cattrans = { 'amusement'        : 'Talk',
             'animatie'         : 'Animated',
             'comedy'           : 'Comedy',
             'documentaire'     : 'Documentary',
             'educatief'        : 'Educational',
             'erotiek'          : 'Adult',
             'film'             : 'Film',
             'informatief'      : 'Educational',
             'jeugd'            : 'Children',
             'kunst/cultuur'    : 'Arts/Culture',
             'misdaad'          : 'Crime/Mystery',
             'muziek'           : 'Music',
             'natuur'           : 'Science/Nature',
             'nieuws/actualiteiten' : 'News',
             'overige'          : 'Unknown',
             'religieus'        : 'Religion',
             'serie/soap'       : 'Drama',
             'sport'            : 'Sports',
             'theater'          : 'Arts/Culture',
             'wetenschap'       : 'Science/Nature'}

# Create a role translation dictionary for the xmltv credits part
# The keys are the roles used by tvgids.nl (lowercase please)
roletrans = {'regie'             : 'director',
             'acteurs'           : 'actor',
             'presentatie'       : 'presenter',
             'scenario'          : 'writer'}

# We have two sources of logos, the first provides the nice ones, but is not 
# complete. We use the tvgids logos to fill the missing bits.
logo_provider = [ 'http://visualisation.tudelft.nl/~paul/logos/gif/64x64/',
                  'http://static.tvgids.nl/gfx/zenders/' ]

# Maps tvgids.nl channel number -> [logo_provider index, logo base name].
# The icon URL is built as logo_provider[index] + name + extension.
logo_names = { 
            1 : [0, 'ned1'],
            2 : [0, 'ned2'],
            3 : [0, 'ned3'],
            4 : [0, 'rtl4'],
            5 : [0, 'een'],
            6 : [0, 'canvas_color'],
            7 : [0, 'bbc1'],
            8 : [0, 'bbc2'],
            9 : [0,'ard'],
            10 : [0,'zdf'],
            11 : [1, 'rtl'],
            12 : [0, 'wdr'],
            13 : [1, 'ndr'],
            14 : [1, 'srsudwest'],
            15 : [1, 'rtbf1'],
            16 : [1, 'rtbf2'],
            17 : [0, 'tv5'],
            18 : [0, 'ngc'],
            19 : [1, 'eurosport'],
            20 : [1, 'tcm'],
            21 : [1, 'cartoonnetwork'],
            24 : [0, 'canal+red'],
            25 : [0, 'mtv-color'],
            26 : [0, 'cnn'],
            27 : [0, 'rai'],
            28 : [1, 'sat1'],
            29 : [0, 'discover-spacey'],
            31 : [0, 'rtl5'],
            32 : [1, 'trt'],
            34 : [0, 'veronica'],
            35 : [0, 'tmf'],
            36 : [0, 'sbs6'],
            37 : [0, 'net5'],
            38 : [1, 'arte'],
            39 : [0, 'canal+blue'],
            40 : [0, 'at5'],
            46 : [0, 'rtl7'],
            49 : [1, 'vtm'],
            50 : [1, '3sat'],
            58 : [1, 'pro7'],
            59 : [1, 'kanaal2'],
            60 : [1, 'vt4'],
            65 : [0, 'animal-planet'],
            73 : [1, 'mezzo'],
            86 : [0, 'bbc-world'],
            87 : [1, 'tve'],
            89 : [1, 'nick'],
            90 : [1, 'bvn'],
            91 : [0, 'comedy_central'],
            92 : [0, 'rtl8'],
            99 : [1, 'sport1_1'],
            100 : [0, 'rtvu'],
            101 : [0, 'tvwest'],
            102 : [0, 'tvrijnmond'],
            103 : [1, 'tvnoordholland'],
            104 : [1, 'bbcprime'],
            105 : [1, 'spiceplatinum'],
            107 : [0, 'canal+yellow'],
            108 : [0, 'tvnoord'],
            109 : [0, 'omropfryslan'],
            114 : [0, 'omroepbrabant']}

# A selection of user agents we will impersonate, in an attempt to be less
# conspicuous to the tvgids.nl police.

user_agents = [ 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
       'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9',
       'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
       'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.7) Gecko/20060909 Firefox/1.5.0.7',
       'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)',
       'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.9) Gecko/20071105 Firefox/2.0.0.9',
       'Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9',
       'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.8) Gecko/20071022 Ubuntu/7.10 (gutsy) Firefox/2.0.0.8'
       ]


# Work in progress, the idea is to cache program categories and
# descriptions to eliminate a lot of page fetches from tvgids.nl
# for programs that do not have interesting/changing descriptions

class ProgramCache:
    """
    A cache to hold program name and category info.
    TVgids stores the detail for each program on a separate URL with an
    (apparently unique) ID. This cache stores the fetched info with the ID.
    New fetches will use the cached info instead of doing an (expensive)
    page fetch.
    """
    def __init__(self, filename=None):
        """
        Create a new ProgramCache object, optionally loading a
        previously dumped cache from file.
        """

        # where we store our info
        self.filename = filename

        if filename is None or not os.path.isfile(filename):
            # no cache file (yet): start empty
            self.pdict = {}
        else:
            self.load(filename)


    def load(self, filename):
        """
        Loads a pickled cache dict from file
        """
        # binary mode: pickle data is not text (text mode corrupts the
        # stream on Windows and with binary pickle protocols)
        cache_file = open(filename, 'rb')
        try:
            self.pdict = pickle.load(cache_file)
        finally:
            cache_file.close()

    def dump(self, filename):
        """
        Dumps a pickled cache, and makes sure it is valid.
        The data is written to a '.tmp' file first and renamed into
        place, so a failed dump cannot corrupt an existing cache.
        """
        if os.access(filename, os.F_OK):
            try:
                os.remove(filename)
            except OSError:
                sys.stderr.write('Cannot remove %s, check permissions\n' % filename)
        tmp_file = open(filename + '.tmp', 'wb')
        try:
            pickle.dump(self.pdict, tmp_file)
        finally:
            tmp_file.close()
        os.rename(filename + '.tmp', filename)


    def query(self, program_id):
        """
        Returns the cached program for program_id, or None when the
        program is not in the cache.
        """
        return self.pdict.get(program_id)

    def add(self, program):
        """
        Adds a program, keyed on its 'ID' field
        """
        self.pdict[program['ID']] = program

    def clear(self):
        """
        Clears the cache (i.e. empties it)
        """
        self.pdict = {}

    def clean(self):
        """
        Removes all cached programming before today.
        Also removes erroneously cached programming.
        """
        now = time.localtime()
        dnow = datetime.datetime(now[0], now[1], now[2])
        # iterate over a copy of the keys so entries can be deleted
        # while looping
        for key in list(self.pdict.keys()):
            try:
                program = self.pdict[key]
                if program['stop-time'] < dnow or program['name'].lower() == 'onbekend':
                    del self.pdict[key]
            except (KeyError, TypeError):
                # malformed cache entry (e.g. missing stop-time); leave
                # it, it will be refreshed on the next fetch
                pass


def usage():
    print 'tv_grab_nl_py: A grabber that grabs tvguide data from tvgids.nl\n'
    print 'and stores it in XMLTV-combatible format.\n'
    print 'Usage:'
    print '--help, -h    = print this info'
    print '--configure   = create configfile (overwrites existing file)'
    print '--config-file = name of the configuration file (default = ~/.xmltv/tv_grab_py.conf'
    print '--capabilities = xmltv required option'
    print '--desc-length = maximum allowed length of programme descriptions in bytes.'
    print '--description = prints a short description of the grabber'
    print '--output      = file where to put the output'
    print '--days        = # number of days to grab'
    print '--preferredmethod = returns the preferred method to be called'
    print '--slow        = also grab descriptions of programming'
    print '--quiet       = suppress all output'
    print '----'
    print '--compat      = append tvgids.nl to the xmltv id (use this if you were using tv_grab_nl)'
    print '--logos       = insert urls to channel icons (mythfilldatabase will then use these)'
    print '--nocattrans  = do not translate the grabbed genres into MythTV-genres'
    print '--cache       = cache descriptions and use the file to store'
    print '--clean_cache = clean the cache file before fetching'
    print '--clear_cache = empties the cache file before fetching data'
    print '--slowdays    = grab slowdays initial days and the rest in fast mode'
    print '--max_overlap = maximum length of overlap between programming to correct [minutes]'
    print '--overlap_strategy = what strategy to use to correct overlaps (check top of source code)'


def filter_line_identity(m, defs=htmlentitydefs.entitydefs):
    """
    re.sub callback: translate one HTML entity to its ISO Latin-1
    character. Unknown or out-of-range entities are returned unchanged.
    """
    k = m.group(1)

    if k.startswith("#"):
        # numeric entity: '#160' (decimal) or '#xA0' (hexadecimal).
        # BUG FIX: this used to test the *string* k[1:] against
        # xrange(256), which is never true, so numeric entities were
        # never translated.
        try:
            if k[1:2] in ('x', 'X'):
                code = int(k[2:], 16)
            else:
                code = int(k[1:])
        except ValueError:
            return m.group(0) # malformed, use as is
        if 0 <= code < 256:
            return chr(code)
        return m.group(0) # outside Latin-1, use as is

    # named entity, e.g. 'nbsp'
    try:
        return defs[k]
    except KeyError:
        return m.group(0) # use as is

def filter_line(s):
    """
    Removes unwanted stuff in strings (adapted from tv_grab_be)
    """

    # translate HTML entities to their latin-1 characters
    s = r_entity.sub(filter_line_identity, s)
    s = s.replace('&nbsp;', ' ')

    # I suspect the next lines are redundant, but they do
    # little harm -- Han Holl
    s = s.replace('\r', ' ')
    s = re.sub('(<.*?>)', '', s)  # strip HTML tags -- Udo

    # tvgids quoting artifacts
    s = s.replace('~Q', "'")
    s = s.replace('~R', "'")

    # Hmm, not sure if I understand this. Without it, mythfilldatabase barfs
    # on program names like "Steinbrecher &..."
    # We must create valid XML -- Han Holl
    return saxutils.escape(s)
    

def calc_timezone(t):
    """
    Takes a time from tvgids.nl and formats it with all the required
    timezone conversions.
    in: '20050429075000'
    out:'20050429075000 (CET|CEST)'

    Until I have figured out how to correctly do timezoning in python this method
    will bork if you are not in a zone that has the same DST rules as 'Europe/Amsterdam'.

    """

    year = int(t[0:4])
    month = int(t[4:6])
    day = int(t[6:8])
    hour = int(t[8:10])
    minute = int(t[10:12])

    #td = {'CET': '+0100', 'CEST': '+0200'}
    #td = {'CET': '+0100', 'CEST': '+0200', 'W. Europe Standard Time' : '+0100', 'West-Europa (standaardtijd)' : '+0100'}
    td = {0 : '+0100', 1 : '+0200'}

    pt = time.mktime((year,month,day,hour,minute,0,0,0,-1))
    timezone=''
    try:
        #timezone = time.tzname[(time.localtime(pt))[-1]]
        timezone = (time.localtime(pt))[-1]
    except:
        sys.stderr.write('Cannot convert time to timezone')

    return t+' %s' % td[timezone]

def format_timezone(td):
    """
    Given a datetime object, returns a string in XMLTV format
    (YYYYMMDDHHMM00 plus the local UTC offset).
    """
    return calc_timezone(td.strftime('%Y%m%d%H%M00'))

def get_page_internal(url, quiet=0):
    """
    Retrieves the url and returns a string with the contents, or None
    when the fetch fails. A random User-Agent from user_agents is sent
    so we look like a regular browser.
    """
    txtdata = None
    txtheaders = {'Keep-Alive' : '300',
                  'User-Agent' : random.choice(user_agents)}
    try:
        rurl = urllib2.Request(url, txtdata, txtheaders)
        fp = urllib2.urlopen(rurl)
        try:
            # read the whole body in one go (readlines+join was a
            # needless intermediate list)
            return fp.read()
        finally:
            fp.close()
    except Exception:
        # best effort: any fetch failure is reported and swallowed
        if not quiet:
            sys.stderr.write('Cannot open url: %s\n' % url)
        return None

class FetchURL(Thread):
    """
    Background thread that fetches a single URL, so the caller can
    enforce a timeout by join()-ing with a limit.
    """
    def __init__(self, url, quiet=0):
        Thread.__init__(self)
        self.url = url
        self.quiet = quiet
        self.result = None  # filled in by run()

    def run(self):
        # store the page contents (or None on failure) for the caller
        self.result = get_page_internal(self.url, self.quiet)

def get_page(url, quiet=0):
    """
    Fetch *url* in a worker thread and wait at most global_timeout
    seconds for it; returns the page contents or None.
    """
    try:
        fetcher = FetchURL(url, quiet)
        fetcher.start()
        # join() returns after the timeout even if the thread is still
        # running; result is then still None
        fetcher.join(global_timeout)
        return fetcher.result
    except:
        if not quiet:
            sys.stderr.write('get_page timed out on (>%s s): %s\n' % (global_timeout, url))
        return None

def get_channels(file, quiet=0):
    """
    Get a list of all available channels and store these
    in a file, one "<number> <name>" pair per line.
    """
    # store channels in a dict keyed on channel number
    channels = {}

    # tvgids stores several instances of channels, we want to
    # find all the possible channels
    channel_get = re.compile('<optgroup label=.*?>(.*?)</optgroup>', re.DOTALL)

    # this is how we will find a (number, channel) instance
    channel_re  = re.compile('<option value="([0-9]+)" >(.*?)</option>', re.DOTALL)

    # this is where we will try to find our channel list
    total = get_page(uitgebreid_zoeken, quiet)
    if total is None:
        return

    # Several instances of the channel list are stored in the page and
    # not all of the instances have all the channels; scanning every
    # <optgroup> and merging into one dict gets them all.
    for station in channel_get.finditer(total):
        for p in channel_re.finditer(station.group(0)):
            try:
                channels[int(p.group(1))] = p.group(2)
            except ValueError:
                sys.stderr.write('Oops, [%s,%s] does not look like a valid channel, skipping it...\n' % (p.group(1), p.group(2)))

    # and create a file with the channels, sorted on channel number
    # (arbitrary but who cares)
    f = open(file, 'w')
    try:
        for k in sorted(channels.keys()):
            f.write("%s %s\n" % (k, channels[k]))
    finally:
        f.close()

def get_channel_all_days(channel, days, quiet=0):
    """
    Get all available days of programming for channel number.

    The output is a list of programming in order where each row
    contains a dictionary with program information (start/stop strings,
    name, detail url, ID, day offset and star-rating).
    """

    now = datetime.datetime.now()

    programs = []

    # Set up the regexps once; they are identical for every day.

    # checktitle will match the title row in H2 tags of the daily overview page, e.g.
    #    <h2>zondag 19 oktober 2008</h2>
    checktitle = re.compile('<h2>(.*?)</h2>', re.DOTALL)

    # getrow will locate each row with program details
    getrow = re.compile('<a href="/programma/(.*?)</a>', re.DOTALL)

    # parserow matches the required program info, with groups:
    # 1 = program ID
    # 2 = broadcast times
    # 3 = program name
    parserow = re.compile('(.*?)/.*<span class="time">(.*?)</span>.*<span class="title">(.*?)</span>', re.DOTALL)

    # normal begin and end times
    times = re.compile('([0-9]+:[0-9]+) - ([0-9]+:[0-9]+)?')

    # Tvgids shows programs per channel per day, so we loop over the number of days
    # we are required to grab
    for offset in range(0, days):

        channel_url = 'http://www.tvgids.nl/zoeken/?d=%i&z=%s' % (offset, channel)

        # For historic purposes, the old style url that gave us a full week in advance:
        #       channel_url = 'http://www.tvgids.nl/zoeken/?trefwoord=Titel+of+trefwoord&interval=0&timeslot='+\
        #           '&station=%s&periode=%i&genre=&order=0' % (channel,days-1)
        # Sniff, we miss you...

        # be nice to tvgids.nl between page fetches
        if offset > 0:
            time.sleep(random.randint(nice_time[0], nice_time[1]))

        # get the raw programming for the day
        total = get_page(channel_url, quiet)
        if total is None:
            return programs

        # Get the day of month listed on the page as well as the expected date we are
        # grabbing and compare these. If these do not match, we skip parsing the
        # programs on the page and issue a warning.
        title = checktitle.search(total).group(1)
        dayno = title.split()[1]
        expected = now + datetime.timedelta(days=offset)

        if not dayno.isdigit() or int(dayno) != expected.day:
            sys.stderr.write('\nOops, did not expect page %s to list programs for "%s", skipping it...\n' % (channel_url,title,))
            continue

        # and find relevant programming info
        for r in getrow.finditer(total):
            detail = parserow.search(r.group(1))
            if detail is None:
                continue

            # parse for begin and end times; either may be missing
            start_time = None
            stop_time = None
            t = times.search(detail.group(2))
            if t is not None:
                start_time = t.group(1)
                stop_time = t.group(2)

            program_url = 'http://www.tvgids.nl/programma/' + detail.group(1) + '/'
            program_name = detail.group(3)

            # store time, name and detail url in a dictionary
            tdict = {}
            tdict['start'] = start_time
            tdict['stop'] = stop_time
            tdict['name'] = program_name
            if tdict['name'] == '':
                tdict['name'] = 'onbekend'
            tdict['url'] = program_url
            tdict['ID'] = detail.group(1)
            tdict['offset'] = offset

            # add star rating if tipped by tvgids.nl
            tdict['star-rating'] = ''
            if r.group(1).find('Tip') != -1:
                tdict['star-rating'] = '4/5'

            # and append the program to the list of programs
            programs.append(tdict)

    # done
    return programs

def make_daytime(time_string, offset=0, cutoff='00:00', stoptime=False):
    """
    Convert an 'HH:MM' string plus a day offset from today into a
    datetime object. Times before the cutoff (or equal to it, when
    stoptime is true) are taken to belong to the next day.

    Examples:
    In [2]:make_daytime('11:34',0)
    Out[2]:datetime.datetime(2006, 8, 3, 11, 34)

    In [3]:make_daytime('11:34',1)
    Out[3]:datetime.datetime(2006, 8, 4, 11, 34)

    In [7]:make_daytime('11:34',0,'12:00')
    Out[7]:datetime.datetime(2006, 8, 4, 11, 34)

    In [4]:make_daytime('11:34',0,'11:34',False)
    Out[4]:datetime.datetime(2006, 8, 3, 11, 34)

    In [5]:make_daytime('11:34',0,'11:34',True)
    Out[5]:datetime.datetime(2006, 8, 4, 11, 34)

    """
    hour, minute = [int(part) for part in time_string.split(':')]

    # compare times numerically as HHMM integers
    as_number = int(time_string.replace(':', ''))
    cutoff_number = int(cutoff.replace(':', ''))

    # roll over to the next day when before the cutoff; a stop time
    # exactly on the cutoff also rolls over
    day_shift = offset
    if as_number < cutoff_number or (stoptime == True and as_number == cutoff_number):
        day_shift += 1

    # build the datetime for today at hour:minute, then shift days;
    # DST is handled at a later point
    today = time.localtime()
    base = datetime.datetime(today[0], today[1], today[2], hour, minute)
    return base + datetime.timedelta(day_shift)

def correct_times(programs, quiet=0):
    """
    Parse a list of programs as generated by get_channel_all_days()  and
    convert begin and end times to xmltv compatible times in datetime objects.

    The list is modified in place: each program dict gains 'start-time'
    and 'stop-time' keys (datetime objects, or None when the source had
    no usable time). Nothing is returned.
    """
    if programs == []:
        return programs
    
    # the start time of programming for this day, times *before* this time are 
    # assumed to be on the next day
    day_start_time = '06:00'

    # initialise using the start time of the first program on this day
    if programs[0]['start'] != None:
        day_start_time = programs[0]['start']

    for program in programs:
        # a program with identical start and stop is treated as having
        # no stop time at all; parse_programs() will try to repair it
        if program['start'] == program['stop']:
            program['stop'] = None

        # convert the times 
        if program['start'] != None:
            program['start-time'] = make_daytime(program['start'], program['offset'], day_start_time)
        else:
            program['start-time'] = None

        if program['stop'] != None:
            program['stop-time'] = make_daytime(program['stop'], program['offset'], day_start_time, stoptime=True)

            # extra correction, needed because the stop time of a program may be on the next day, after the
            # day cutoff. For example: 
            # 06:00 - 23:40 Long Program
            # 23:40 - 00:10 Lala
            # 00:10 - 08:00 Wawa 
            # This puts the end date of Wawa on the current, instead of the next day. There is no way to detect
            # this with a single cutoff in make_daytime. Therefore, check if there is a day difference between
            # start and stop dates and correct if necessary.
            if program['start-time'] != None:
                # make two dates (at midnight) so only whole-day
                # differences are compared
                start = program['start-time']
                stop  = program['stop-time']
                single_day = datetime.timedelta(1)
                startdate = datetime.datetime(start.year,start.month,start.day)
                stopdate  = datetime.datetime(stop.year,stop.month,stop.day)
                # stop landed a full day *before* start: push it forward
                if startdate - stopdate == single_day:
                    program['stop-time'] = program['stop-time'] + single_day
        else:
            program['stop-time'] = None

def parse_programs(programs, offset=0, quiet=0):
    """
    Parse a list of programs as generated by get_channel_all_days() and
    convert begin and end times to xmltv compatible times.

    Several correction passes are applied, in order:
      1. fill in a missing stop time from the next program's start time
      2. keep only programs with distinct, present start/stop times
      3. drop programs that stop before they begin (Han Holl)
      4. drop grouping/broadcaster entries that envelop real programs
      5. smooth small gaps/overlaps according to overlap_strategy
      6. (experimental, off by default) clump large overlaps

    Returns the list of surviving programs.
    """

    # good programs
    good_programs = []

    # calculate absolute start and stop times
    correct_times(programs, quiet)

    # next, correct for missing end time and copy over all good programming to the
    # good_programs list
    for i in range(len(programs)):

        # Try to correct missing end time by taking start time from next program on schedule
        if programs[i]['stop-time'] is None and i < len(programs)-1:
            if not quiet:
                sys.stderr.write('Oops, "%s" has no end time. Trying to fix...\n' % programs[i]['name'])
            programs[i]['stop-time'] = programs[i+1]['start-time']

        # The common case: start and end times are present and are not
        # equal to each other (yes, this can happen)
        if programs[i]['start-time'] is not None and \
           programs[i]['stop-time']  is not None and \
           programs[i]['start-time'] != programs[i]['stop-time']:
            good_programs.append(programs[i])

    # Han Holl: try to exclude programs that stop before they begin.
    # Iterate backwards so deletions do not shift indices still to visit.
    for i in range(len(good_programs)-1, -1, -1):
        if good_programs[i]['stop-time'] <= good_programs[i]['start-time']:
            if not quiet:
                sys.stderr.write('Deleting invalid stop/start time: %s\n' % good_programs[i]['name'])
            del good_programs[i]

    # Try to exclude programs that only identify a group or broadcaster and have
    # overlapping start/end times with the actual programs
    for i in range(len(good_programs)-2, -1, -1):
        if good_programs[i]['start-time'] <= good_programs[i+1]['start-time'] and \
           good_programs[i]['stop-time']  >= good_programs[i+1]['stop-time']:
            if not quiet:
                sys.stderr.write('Deleting grouping/broadcaster: %s\n' % good_programs[i]['name'])
            del good_programs[i]

    for i in range(len(good_programs)-1):

        # PdB: Fix tvgids start-before-end x minute interval overlap.  An overlap (positive or
        # negative) is halved and each half is assigned to the adjacent programmes. The maximum
        # overlap length between programming is set by the global variable 'max_overlap' and is
        # default 10 minutes. Examples:
        #
        # Positive overlap (= overlap in programming):
        #   10:55 - 12:00 Lala
        #   11:55 - 12:20 Wawa
        # is transformed in:
        #   10:55 - 11.57 Lala
        #   11:57 - 12:20 Wawa
        #
        # Negative overlap (= gap in programming):
        #   10:55 - 11:50 Lala
        #   12:00 - 12:20 Wawa
        # is transformed in:
        #   10:55 - 11.55 Lala
        #   11:55 - 12:20 Wawa

        stop  = good_programs[i]['stop-time']
        start = good_programs[i+1]['start-time']
        dt    = stop - start
        avg   = start + dt / 2
        overlap = 24*60*60*dt.days + dt.seconds  # signed overlap in seconds

        # check for the size of the overlap
        if 0 < abs(overlap) <= max_overlap*60:
            if not quiet:
                if overlap > 0:
                    sys.stderr.write('"%s" and "%s" overlap %s minutes. Adjusting times.\n' % \
                        (good_programs[i]['name'],good_programs[i+1]['name'],overlap / 60))
                else:
                    sys.stderr.write('"%s" and "%s" have gap of %s minutes. Adjusting times.\n' % \
                        (good_programs[i]['name'],good_programs[i+1]['name'],abs(overlap) / 60))

            # stop-time of previous program wins
            if overlap_strategy == 'stop':
               good_programs[i+1]['start-time'] = good_programs[i]['stop-time']
            # start-time of next program wins
            elif overlap_strategy == 'start':
               good_programs[i]['stop-time'] = good_programs[i+1]['start-time']
            # average the difference
            elif overlap_strategy == 'average':
               good_programs[i]['stop-time']    = avg
               good_programs[i+1]['start-time'] = avg
            # leave as is
            else:
               pass

    # Experimental strategy to make sure programming does not disappear. All programs that overlap more
    # than the maximum overlap length, but less than the shortest length of the two programs are
    # clumped.
    if do_clump:
        for i in range(len(good_programs)-1):

            stop  = good_programs[i]['stop-time']
            start = good_programs[i+1]['start-time']
            dt    = stop - start
            overlap = 24*60*60*dt.days + dt.seconds

            length0 = good_programs[i]['stop-time']   - good_programs[i]['start-time']
            length1 = good_programs[i+1]['stop-time'] - good_programs[i+1]['start-time']

            l0 = length0.days*24*60*60 + length0.seconds
            # BUG FIX: l1 used to be computed from length0.seconds
            l1 = length1.days*24*60*60 + length1.seconds

            # NOTE(review): l0/l1 are already in seconds, so the '*60' in
            # min(l0,l1)*60 looks suspicious -- left as-is because this code
            # path is experimental and disabled by default (do_clump = False)
            if abs(overlap) >= max_overlap*60 <= min(l0,l1)*60 and \
                'clumpidx' not in good_programs[i] and \
                'clumpidx' not in good_programs[i+1]:
                good_programs[i]['clumpidx']   = '0/2'
                good_programs[i+1]['clumpidx'] = '1/2'
                good_programs[i]['stop-time'] = good_programs[i+1]['stop-time']
                good_programs[i+1]['start-time'] = good_programs[i]['start-time']


    # done, nothing to see here, please move on
    return good_programs

def get_descriptions(programs, program_cache=None, nocattrans=0, quiet=0, slowdays=0):
    """
    Given a list of programs, from get_channel, retrieve program information.

    For each program within the first `slowdays` day-offsets, the detail page
    on tvgids.nl is fetched (or served from program_cache) and parsed. The
    program dicts are updated in place with description lines ('detail1',
    'detail2', ...), genre, credits, video/audio flags and an info url.
    Programs named 'onbekend' (unknown) are never cached. Returns None.
    """

    # This regexp tries to find details such as Genre, Acteurs, Jaar van Premiere etc.
    detail      = re.compile('<li>.*?<strong>(.*?):</strong>.*?<br />(.*?)</li>', re.DOTALL)

    # These regexps find the description area, the program type and descriptive text
    description = re.compile('<div class="description">.*?<p class="text">(.*?)</div>',re.DOTALL)
    descrtype = re.compile('<span class="type">(.*?)</span>',re.DOTALL)
    descrline = re.compile('<p>(.*?)</p>',re.DOTALL)

    # randomize detail requests so tvgids.nl does not see a sequential scan
    # (list() is needed because random.shuffle cannot shuffle a bare range)
    nprograms = len(programs)
    fetch_order = list(range(0,nprograms))
    random.shuffle(fetch_order)

    counter = 0
    for i in fetch_order:
        counter += 1
        # only the first `slowdays` day-offsets get the slow, detailed fetch
        if programs[i]['offset'] >= slowdays:
            continue

        if not quiet:
            sys.stderr.write('\n(%3.0f%%) %s: %s ' % (100*float(counter)/float(nprograms), i, programs[i]['name']))

        # check the cache for this program's ID
        cached_program = program_cache.query(programs[i]['ID'])
        if (cached_program != None):
                if not quiet:
                    sys.stderr.write(' [cached]')
                # copy the cached information, except the start/end times, rating and clumping,
                # these may have changed.
                tstart = programs[i]['start-time']
                tstop  = programs[i]['stop-time']
                rating = programs[i]['star-rating']
                try:
                    clump  = programs[i]['clumpidx']
                except KeyError:
                    clump = False
                programs[i] = cached_program
                programs[i]['start-time'] = tstart
                programs[i]['stop-time']  = tstop
                programs[i]['star-rating'] = rating
                if clump:
                    programs[i]['clumpidx'] = clump
                continue
        else:
            # be nice to tvgids.nl
            time.sleep(random.randint(nice_time[0], nice_time[1]))

        # get the details page, and get all the detail nodes
        descriptions = ()
        details = ()
        try:
            if not quiet:
                sys.stderr.write(' [normal fetch]')
            total = get_page(programs[i]['url'])
            details = detail.finditer(total)

            descrspan = description.search(total)
            descriptions = descrline.finditer(descrspan.group(1))
        except:
            # deliberately broad: any fetch or parse failure means we simply
            # proceed to the next program without detail information
            if not quiet:
                sys.stderr.write(' [fetch failed or timed out]')
            continue

        # define containers
        programs[i]['credits'] = {}
        programs[i]['video']   = {}

        # now parse the details

        line_nr = 1

        # First, we try to find the program type in the description section.
        # Note that this is not the same as the generic genres (these are searched
        # later on), but a more descriptive one like "Culinair programma".
        # If present, we store this as first part of the regular description.
        # Guarded: some pages have no type span, in which case search() returns
        # None (the unguarded .group(1) used to raise AttributeError here).
        typematch = descrtype.search(descrspan.group(1))
        if typematch != None:
            programs[i]['detail1'] = typematch.group(1).capitalize()
        else:
            programs[i]['detail1'] = ''
        if programs[i]['detail1'] != '':
           line_nr = line_nr + 1

        # Secondly, we add one or more lines of the program description that are present.

        for descript in descriptions:
            d_str = 'detail' + str(line_nr)
            programs[i][d_str] = descript.group(1)

            # Remove sponsored link from description if present.
            sponsor_pos = programs[i][d_str].rfind('<i>Gesponsorde link:</i>')
            if sponsor_pos > 0:
                programs[i][d_str] = programs[i][d_str][0:sponsor_pos]

            programs[i][d_str] = filter_line(programs[i][d_str]).strip()
            line_nr = line_nr + 1

        # Finally, we check out all program details. These are generically denoted as:
        #
        #   <li><strong>(TYPE):</strong><br />(CONTENT)</li>
        #
        # Some examples:
        #
        #   <li><strong>Genre:</strong><br />16 oktober 2008</li>
        #   <li><strong>Genre:</strong><br />Amusement</li>

        for d in details:
            # 'detail_type' rather than 'type' to avoid shadowing the builtin
            detail_type  = d.group(1).strip().lower()
            content_asis = d.group(2).strip()
            content      = filter_line(content_asis).strip()

            if content == '':
                continue

            elif detail_type == 'genre':

                # Fix detection of movies based on description as tvgids.nl sometimes
                # categorises a movie as e.g. "Komedie", "Misdaadkomedie", "Detectivefilm".
                genre = content
                if    (programs[i]['detail1'].lower().find('film')      != -1 \
                   or  programs[i]['detail1'].lower().find('komedie')   != -1)\
                   and programs[i]['detail1'].lower().find('tekenfilm') == -1 \
                   and programs[i]['detail1'].lower().find('animatiekomedie') == -1 \
                   and programs[i]['detail1'].lower().find('filmpje')   == -1:
                    genre = 'film'

                if nocattrans:
                    programs[i]['genre'] = genre.title()
                else:
                    try:
                        programs[i]['genre'] = cattrans[genre.lower()]
                    except KeyError:
                        # genre unknown to the translation table: leave empty
                        programs[i]['genre'] = ''


            # Parse persons and their roles for credit info
            elif detail_type in roletrans:
                programs[i]['credits'][roletrans[detail_type]] = []

                persons = content_asis.split(',')

                for name in persons:
                    # strip role prefixes ("rol: naam"), character suffixes
                    # ("naam - rol") and trailing "e.a" (et al.) markers
                    if name.find(':') != -1:
                        name = name.split(':')[1]
                    if name.find('-') != -1:
                        name = name.split('-')[0]
                    if name.find('e.a') != -1:
                        name = name.split('e.a')[0]
                    programs[i]['credits'][roletrans[detail_type]].append(filter_line(name.strip()))

            elif detail_type == 'bijzonderheden':
                # miscellaneous flags: widescreen, black/white, teletext subtitles, stereo
                if content.find('Breedbeeld') != -1:
                    programs[i]['video']['breedbeeld'] = 1
                if content.find('Zwart') != -1:
                    programs[i]['video']['blackwhite'] = 1
                if content.find('Teletekst') != -1:
                    programs[i]['teletekst'] = 1
                if content.find('Stereo') != -1:
                    programs[i]['stereo'] = 1
            elif detail_type == 'url':
                programs[i]['infourl'] = content
            else:
                # In unmatched cases, we still add the parsed type and content to the program details.
                # Some of these will lead to xmltv output during the xmlefy_programs step
                programs[i][detail_type] = content

        # do not cache programming that is unknown at the time
        # of fetching.

        if programs[i]['name'].lower() != 'onbekend':
            program_cache.add(programs[i])

    if not quiet:
        sys.stderr.write('\ndone...\n\n')

    # done
      
def title_split(program):
    """
    Some channels have the annoying habit of adding the subtitle to the title of a program.
    This function attempts to fix this, by splitting the name at a ': '.

    Programs that already carry an episode title, and films (whose titles
    legitimately contain colons), are left untouched. The program dict is
    modified in place; nothing is returned.
    """

    # 'in' replaces the deprecated dict.has_key() (removed in Python 3);
    # behavior is identical for dict operands.
    if  ('titel aflevering' in program and program['titel aflevering'] != '')  \
     or ('genre' in program and program['genre'].lower() in ['movies','film']):
       return

    # split on the *last* ': ' so "a: b: c" yields name "a: b", subtitle "c"
    colonpos =  program['name'].rfind(': ')
    if colonpos > 0:
       program['titel aflevering'] = program['name'][colonpos+1:len(program['name'])].strip()
       program['name'] =  program['name'][0:colonpos].strip()

def xmlefy_programs(programs, channel, desc_len, compat=0, nocattrans=0):
    """
    Given a list of programming (from get_channels())
    returns a string with the xml equivalent.

    channel is the xmltv channel id; desc_len caps the description length;
    compat appends '.tvgids.nl' to the channel id (old tv_grab_nl ids);
    nocattrans adds lang="nl" to the (untranslated) category element.
    """
    output = []
    for program in programs:

        clumpidx = ''
        try:
            # 'in' replaces the deprecated dict.has_key() here and below
            if 'clumpidx' in program:
                clumpidx = 'clumpidx="'+program['clumpidx']+'"'
        except:
            # unexpected non-dict entry: dump it for debugging
            # (parenthesized so this also parses under Python 3)
            print(program)

        output.append('  <programme start="%s" stop="%s" channel="%s%s" %s> \n' % \
            (format_timezone(program['start-time']), format_timezone(program['stop-time']),\
             channel, compat and '.tvgids.nl' or '', clumpidx))

        output.append('    <title lang="nl">%s</title>\n' % filter_line(program['name']))

        if 'titel aflevering' in program and program['titel aflevering'] != '':
                output.append('    <sub-title lang="nl">%s</sub-title>\n' % filter_line(program['titel aflevering']))

        desc = []
        for detail_row in ['detail1','detail2','detail3']:
                if detail_row in program and program[detail_row] != '' and program[detail_row].find('geen detailgegevens bekend') == -1:
                        desc.append('%s ' % program[detail_row])
        if desc != []:
                # join and remove newlines from descriptions
                desc_line = "".join(desc).strip()
                # BUGFIX: str.replace returns a new string; the original call
                # discarded the result, leaving newlines in the XML output
                desc_line = desc_line.replace('\n', ' ')
                if len(desc_line) > desc_len:
                    # truncate at a word boundary, leaving room for '...'
                    spacepos = desc_line[0:desc_len-3].rfind(' ')
                    if spacepos < 0:
                        # no space found: hard-truncate instead of slicing
                        # with -1 (which mangled the text and overshot desc_len)
                        spacepos = desc_len-3
                    desc_line = desc_line[0:spacepos] + '...'
                output.append('    <desc lang="nl">%s</desc>\n' % desc_line)

        # Process credits section if present.
        # This will generate director/actor/presenter info.
        if 'credits' in program and program['credits'] != {}:
            output.append('    <credits>\n')
            for role in program['credits']:
                for name in program['credits'][role]:
                    if name != '':
                        output.append('       <%s>%s</%s>\n' % (role, name, role))
            output.append('    </credits>\n')

        if 'jaar van premiere' in program and program['jaar van premiere'] != '':
                output.append('    <date>%s</date>\n' % program['jaar van premiere'])

        if 'genre' in program and program['genre'] != '':
                output.append('    <category')
                if nocattrans:
                   output.append(' lang="nl"')
                output.append('>%s</category>\n' % program['genre'])

        if 'infourl' in program and program['infourl'] != '':
                output.append('    <url>%s</url>\n' % program['infourl'])

        if 'aflevering' in program and program['aflevering'] != '':
                output.append('    <episode-num system="onscreen">%s</episode-num>\n' % filter_line(program['aflevering']))

        # Process video section if present
        if 'video' in program and program['video'] != {}:
            output.append('    <video>\n')
            if 'breedbeeld' in program['video']:
                output.append('           <aspect>16:9</aspect>\n')
            if 'blackwhite' in program['video']:
                output.append('           <colour>no</colour>\n')
            output.append('    </video>\n')

        if 'stereo' in program:
            output.append('    <audio><stereo>stereo</stereo></audio>\n')

        if 'teletekst' in program:
            output.append('    <subtitles type="teletext" />\n')

        # Set star-rating if applicable
        if program['star-rating'] != '':
             output.append('    <star-rating><value>%s</value></star-rating>\n' % program['star-rating'])

        output.append('  </programme>\n')

    return "".join(output)

def main():
    """
    Command-line entry point.

    Parses options, loads (or creates, with --configure) the channel
    configuration, then for every configured channel fetches the listings,
    optionally fetches per-program details (slow mode), and writes an
    XMLTV document to stdout (or to the file given with --output).
    """

    # Parse command line options
    try:
        opts, args = getopt.getopt(sys.argv[1:], "h", ["help", "output=", "capabilities", 
                                                       "preferredmethod", "days=", 
                                                       "configure", "slow",
                                                       "cache=", "clean_cache", 
                                                       "slowdays=","compat",
                                                       "desc-length=","description",
                                                       "nocattrans","config-file=",
                                                       "max_overlap=", "overlap_strategy=",
                                                       "clear_cache", "quiet","logos"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    # DEFAULT OPTIONS - Edit if you know what you are doing

    # where the output goes
    output      = None
    output_file = None

    # the total number of days to fetch 
    days        = 5

    # fetch data in slow mode, i.e. grab all the detail information,
    # slow means slow, because for each program a web page needs to be fetched
    slow        = 1
    #slow        = 0

    # number of days to fetch in slow mode. For example: --days 5 --slowdays 2, will 
    # fetch the first two days in slow mode (with all the details) and the remaining three
    # days in fast mode.
    slowdays    = 4

    # no output 
    quiet       = 0

    # insert url of channel logo into the xml data, this will be picked up by mythfilldatabase
    logos       = 1

    # enable this option if you were using tv_grab_nl, it adjusts the generated
    # xmltvid's so that everything works.
    compat      = 0
    
    # enable this option if you do not want the tvgids categories being translated into
    # MythTV-categories (genres)
    nocattrans  = 0

    # Maximum number of characters to use for program description.
    # Different values may work better in different versions of MythTV.
    desc_len = 475
 
    # default configuration file locations
    hpath = ''
    if os.environ.has_key('HOME'):
        hpath = os.environ['HOME']
    # extra test for windows users
    elif os.environ.has_key('HOMEPATH'):
        hpath = os.environ['HOMEPATH']

    # hpath = ''
    xmltv_dir   = hpath+'/.xmltv'

    program_cache_file = xmltv_dir+'/program_cache'
    config_file = xmltv_dir+'/tv_grab_nl_py.conf'

    # cache the detail information. 
    program_cache = None
    clean_cache = 1
    clear_cache = 0

    # seed the random generator
    random.seed(time.time())

    # First option pass: flags that must take effect before anything else;
    # the informational options (--description, --capabilities,
    # --preferredmethod) print their answer and exit immediately.
    for o, a in opts:
        if o in ("-h", "--help"):
            usage()
            sys.exit(1)

        if o == "--quiet":
            quiet = 1;

        if o == "--description":
            print "The Netherlands (tv_grab_nl_py $Rev: 92 $)"
            sys.exit(0)

        if o == "--capabilities":
            print "baseline"
            print "cache"
            print "manualconfig"
            print "preferredmethod"
            sys.exit(0)

        if o == '--preferredmethod':
            print 'allatonce'
            sys.exit(0)

        if o == '--desc-length':
            # Use the requested length for programme descriptions.
            desc_len = int(a)
            if not quiet:
                sys.stderr.write('Using description length: %d\n' % desc_len)

    # Second pass: pick up the config-file location before the third pass,
    # where --configure may need to create that file.
    for o, a in opts:
        if o == "--config-file":
            # use the provided name for configuration
            config_file = a
            if not quiet:
                sys.stderr.write('Using config file: %s\n' % config_file)

    # Third pass: all remaining options.
    for o, a in opts:
        if o == "--configure":
            # check for the ~.xmltv dir
            if not os.path.exists(xmltv_dir):
                if not quiet:
                    sys.stderr.write('You do not have the ~/.xmltv directory,')
                    sys.stderr.write('I am going to make a shiny new one for you...')
                os.mkdir(xmltv_dir)
            if not quiet:
                sys.stderr.write('Creating config file: %s\n' % config_file)
            get_channels(config_file)
            sys.exit(0)

        if o == "--days":
            # limit days to maximum supported by tvgids.nl
            days = min(int(a),6)

        if o == "--compat":
            compat = 1

        if o == "--nocattrans":
            nocattrans = 1

        if o == "--slow":
            slow = 1

        if o == "--output":
            output_file = a
            try:
                output = open(output_file,'w')
                # and redirect output
                if debug:
                    debug_file = open('/tmp/kaas.xml','w')
                    blah = redirect.Tee(output, debug_file) 
                    sys.stdout = blah
                else:
                    sys.stdout = output
            except:
                if not quiet:
                    sys.stderr.write('Cannot write to outputfile: %s\n' % output_file)
                sys.exit(2)

        if o == "--slowdays":
            # limit slowdays to maximum supported by tvgids.nl
            slowdays = min(int(a),6)
            # slowdays implies slow == 1
            slow = 1

        if o == "--clean_cache":
            clean_cache = 1
        if o == "--clear_cache":
            clear_cache = 1
        if o == "--cache":
            program_cache_file = a
        # NOTE(review): max_overlap and overlap_strategy are stored here but
        # never passed on below (parse_programs is called with the default
        # None) -- confirm whether these options are meant to have an effect.
        if o == "--max_overlap":
            max_overlap = int(a)
        if o == "--overlap_strategy":
            overlap_strategy = a

    # get configfile if available
    try:
        f = open(config_file,'r')
    except:
        sys.stderr.write('Config file %s not found.\n' % config_file)
        sys.stderr.write('Re-run me with the --configure flag.\n')
        sys.exit(1)

    #check for cache
    program_cache = ProgramCache(program_cache_file)
    if clean_cache != 0:
        program_cache.clean()
    if clear_cache != 0:
        program_cache.clear()

    # Go!
    channels = {}

    # Read the channel stuff: one "<id> <name...>" pair per line,
    # '#' lines are comments
    for blah in f.readlines():
        blah = blah.lstrip()
        blah = blah.replace('\n','')
        if blah:
            if blah[0] != '#':
                channel = blah.split()
                channels[channel[0]] = " ".join(channel[1:])

    # channels are now in channels dict keyed on channel id

    # print header stuff
    print '<?xml version="1.0" encoding="ISO-8859-1"?>'
    print '<!DOCTYPE tv SYSTEM "xmltv.dtd">'
    print '<tv generator-info-name="tv_grab_nl_py $Rev: 92 $">'

    # first do the channel info
    for key in channels.keys():
        print '  <channel id="%s%s">' % (key, compat and '.tvgids.nl' or '')
        print '    <display-name lang="nl">%s</display-name>' % channels[key]
        if (logos):
            ikey = int(key)
            if logo_names.has_key(ikey):
                full_logo_url = logo_provider[logo_names[ikey][0]]+logo_names[ikey][1]+'.gif'
                print '    <icon src="%s" />' % full_logo_url
        print '  </channel>'

    num_chans = len(channels.keys())
    channel_cnt = 0
    # NOTE(review): when clean_cache is set, the cache was already cleaned
    # above; this cleans it unconditionally a second time.
    if program_cache != None:
        program_cache.clean()

    fluffy = channels.keys()
    nfluffy = len(fluffy)
    for id in fluffy:
        channel_cnt += 1
        if not quiet:
                sys.stderr.write('\n\nNow fetching %s(xmltvid=%s%s) (channel %s of %s)\n' % \
                    (channels[id], id, (compat and '.tvgids.nl' or ''), channel_cnt, nfluffy))
        info = get_channel_all_days(id,  days, quiet)
        blah = parse_programs(info, None, quiet)

        # fetch descriptions
        if slow:
           get_descriptions(blah, program_cache, nocattrans, quiet, slowdays)
        
        # Split titles with colon in it
        # Note: this only takes place if all days retrieved are also grabbed with details (slowdays=days)
        # otherwise this function might change some titles after a few grabs and thus may result in
        # loss of programmed recordings for these programs.
        if slowdays == days:
            for program in blah:
               title_split(program)

        print xmlefy_programs(blah, id, desc_len, compat, nocattrans)

        # save the cache after each channel fetch 
        if program_cache != None:
            program_cache.dump(program_cache_file)
    
        # be nice to tvgids.nl
        time.sleep(random.randint(nice_time[0], nice_time[1]))
        # NOTE(review): the cache is dumped a second time right after the dump
        # above -- one of the two writes looks redundant.
        if program_cache != None:
            program_cache.dump(program_cache_file)

    # print footer stuff
    print "</tv>"

    # close the outputfile if necessary
    if output != None:
        output.close()

    # and return success
    sys.exit(0)

# Allow this file to be imported as a module: the grabber only runs when
# the file is executed directly as a script.
if __name__ == '__main__':
    main()

# vim:tw=0:et:sw=4
