#!/bin/sh
# Usage: ppttitles 01/index.htm 01_3.xml
# Read in 01/01/index.htm and generate a list of the PowerPoint slide titles
# in xml format
# The output will look like
#
# <?xml version="1.0" encoding="utf-8"?>
# <Slides>
#    <Slide href="Slide1.gif" title="Slide 1"/>
#    <Slide href="Slide2.gif" title="Slide 2"/>
# </Slides>
# 
# The second argument is optional.  If it is present, then
# it is assumed to name a UTF-8 file that was generated by Windows Media
# File Editor and then converted to UTF-8.  We look in this file
# for Script tags that have Command Attributes and adjust the
# output to match the base of the Command Attribute.  For example
# if the index.htm file contains links to files like sld002.htm, and
# the second argument contains links to files like 01/slide2.gif, then
# we adjust the output so that we have links to files like slide2.gif
# instead of sld002.htm.
#

# Require exactly one or two arguments (index.htm and, optionally, the
# toc xml file).  Anything else prints the usage text and exits with
# status 9.  The usage text goes to stderr so that it cannot end up in
# an XML file when the caller redirects stdout.
case $# in
	1|2)
		;;
	*)
		echo "Usage: $0 index.htm [toc.xml ]" >&2
		echo " Read in index.htm and generate a list of the PowerPoint slide titles" >&2
		exit 9
		;;
esac


# detect_command_prefix TOC_FILE
#   Scan TOC_FILE for lines containing ' Command="' and inspect the
#   SECOND such line.  The second one is used because the first one
#   sometimes differs from the rest (it might have been added by hand).
#   The text between the quotes is taken as a URL; its last
#   '/'-separated component is examined and one of the following is
#   written to stdout:
#       slide   if that component contains "slide"
#       Slide   otherwise, if it contains "Slide"
#   Nothing is printed if neither matches or if the file has fewer than
#   two Command lines.
detect_command_prefix() {
    awk '
    / Command="/ {
        commandCount++
        if (commandCount == 2) {
            # p is the index of the first character after the opening quote.
            p = match($0, / Command=/) + length(" Command=") + 1
            commandEtc = substr($0, p, length($0) - p)
            # command[1] is everything up to the closing quote.
            split(commandEtc, command, "\"")
            np = split(command[1], pa, "/")
            baseurl = pa[np]
            if (baseurl ~ /slide/) {
                print "slide"
            } else if (baseurl ~ /Slide/) {
                print "Slide"
            }
            exit
        }
    }' "$1"
}

# With a second argument, take the image-name prefix from that file;
# without one, default to "Slide".
if [ $# -eq 2 ]; then
    commandurl=$(detect_command_prefix "$2")
else
    commandurl=Slide
fi

# An empty prefix means no usable Command attribute was found.  Only a
# warning is issued; the script carries on.
if [ -z "$commandurl" ]; then
    echo "$0: Warning, 'commandurl' = '', which is likely a bug" 1>&2
    echo "     The problem is likely in '$2'" 1>&2
fi

# emit_slides_xml INDEX_FILE PREFIX
#   Write the <Slides> XML document for INDEX_FILE to stdout.
#   PREFIX is the image base name ("Slide" or "slide") that replaces
#   "sld" in the links, so that sld001.htm becomes <PREFIX>1.gif.
#
#   Pipeline stages:
#     1. keep only lines containing 'a href="sld' (any case)
#     2. drop the "Click here to start" link
#     3. map a vertical tab and the bytes 0205, 0222-0227 to plain
#        ASCII (presumably Windows-1252 punctuation -- TODO confirm)
#     4. escape & as &amp;
#     5. awk emits one <Slide> element per anchor
emit_slides_xml() {
    grep -i -- 'a href="sld' "$1" |
    grep -vi "Click here to start" |
    tr "\013\205\222\223\224\225\226\227" " .'' \-\-_" |
    sed 's/&/&amp;/g' |
    awk -v commandurl="$2" '
BEGIN {
    print "<?xml version=\"1.0\" encoding=\"utf-8\"?>"
    print "<Slides>"
}
{
    nf = split($0, f, "<")
    # Look for the A HREF tag and print out the part after the >
    for (i = 1; i <= nf; i++) {
        # The grep stage is case-insensitive, so lower-case anchors
        # reach this point too; match the tag name in any case.
        if (toupper(f[i]) ~ /A HREF/) {
            # tag[1] is the tag itself, tag[2] the link text (title).
            split(f[i], tag, ">")
            # href[2] is the text between the first pair of quotes.
            split(tag[1], href, "\"")
            if (tag[2] ~ /^PPT Slide$/ && href[2] ~ /^sld/) {
                # Placeholder title: replace it with "Slide N".
                talkSlideNumber = substr(href[2], 4, length(href[2]) - 7)
                talkSlideNumberFixed = talkSlideNumber + 0
                tag[2] = "Slide " talkSlideNumberFixed
            }
            if (href[2] ~ /^sld/ && commandurl != "sld") {
                # Convert sld to whatever the value of commandurl is and
                # get rid of leading zeros, so sld001.htm -> slide1.gif
                slideNumber = substr(href[2], 4, length(href[2]) - 7)
                previousSlideNumberFixed = slideNumberFixed
                slideNumberFixed = slideNumber + 0
                # FIXME: two variables are needed here, slideNumberFixed
                # and talkSlideNumberFixed, to handle checking against
                # the previous slide number.  talkSlideNumberFixed is
                # used below for empty titles.
                talkSlideNumberFixed = slideNumberFixed
                if (slideNumberFixed > previousSlideNumberFixed) {
                    if (mergedPPTFile == 0) {
                        # Slides of a single powerpoint file: use the
                        # number from the link.
                        href[2] = commandurl slideNumberFixed ".gif"
                    } else {
                        # Merged powerpoint file: use the running count.
                        mergedCount++
                        href[2] = commandurl mergedCount ".gif"
                    }
                } else {
                    # The slide number did not increase, so the input is
                    # the concatenation of several index.htm files, e.g.
                    #     cat */*/index.htm > all.htm
                    # The talks come from one merged ppt file whose
                    # images are numbered continuously, and each talk is
                    # assumed to be separated by one blank gif:
                    #     talk 1: Slide1.gif ... SlideN.gif
                    #     SlideN+1.gif is blank
                    #     talk 2: SlideN+2.gif ... SlideM.gif
                    # From here on the number in the link is ignored and
                    # a running count is used instead, stepping by two at
                    # each talk boundary to skip the blank.
                    if (mergedPPTFile == 0) {
                        print "<!-- 2nd talk -->"
                        mergedPPTFile = 1
                        mergedCount = previousSlideNumberFixed + 2
                    } else {
                        print "<!-- new talk -->"
                        mergedCount += 2
                    }
                    href[2] = commandurl mergedCount ".gif"
                }
            }
            if (tag[2] == "") {
                # FIXME: The title is empty, which can cause problems
                # when playing the stream.  The title of the slide is
                # likely so long that it is on the line after the
                # A HREF line.  Fall back to "Slide N".
                tag[2] = "Slide " talkSlideNumberFixed
            }
            # A literal double quote would end the title attribute
            # early and make the XML malformed.
            gsub(/"/, "\\&quot;", tag[2])
            print "    <Slide href=\"" href[2] "\" title=\"" tag[2] "\"/>"
        }
    }
}
END { print "</Slides>" }
'
}

emit_slides_xml "$1" "$commandurl"