# use perl                                  -*- mode: Perl; -*-
eval 'exec perl -S $0 "$@"'
  if $running_under_some_shell;

use vars qw($running_under_some_shell);         # no whining!

# Daily Update, Version 4.4

# DailyUpdate grabs dynamic information from the internet and integrates it
# into your webpage. Features:
# - Timeouts to handle dead servers without hanging the script (Script timer
#   doesn't work on Windows platforms since they don't have fork().)
# - User-defined update times
# - Modular design allows easy extension to new data sources
# - Compatible with cgi-wrap (if you want to call it on-the-fly)

# For documentation, go to http://www.cs.virginia.edu/~dwc3q/code/update.html.
# For examples of more data acquisition schemas, go to
# http://www.cs.virginia.edu/~dwc3q/code/schemas.html

# This code uses LWP::UserAgent from the libwww library, URI::URL, and
# HTML::Parser, all of which are available on CPAN at
# %CPAN%/modules/by-module/LWP/. (Go to http://www.perl.com/ if you don't know
# how to get to CPAN.)

# I suggest using wwwis by Alex Knowles if your file contains a lot of images,
# which can slow down viewing time on browsers. His script, at
# http://www.tardis.ed.ac.uk/~ark/wwwis/, will determine image sizes and
# insert them into the HTML.

# To use, change the $inHtml and $outHtml variables in the configuration, and
# any other settings you want. Then just run DailyUpdate.  Since the example
# template isn't too exciting, you may want to take your normal homepage and
# jazz it up with the new tags.

# Later, you might want to customize or extend the data gathered by this
# script. See the "tagToHandler" structure in the configuration.

################################################################################

# If you would like to be notified of updates, send email to me at
# coppit@cs.virginia.edu. The latest version is always at
# http://www.cs.virginia.edu/~dwc3q/code/.

# Written by: David Coppit  http://www.cs.virginia.edu/~dwc3q/index.html
#                          <coppit@cs.virginia.edu>
# Early versions also hacked on by: Bob Anzlovar <bob.anzlovar@ssds.com>,
# Christian Blair <webguy@con2.com>, Alvaro Herrera <alvherre@enlaces.c5.cl>,
# David Engler <dengler@mindspring.net>

# Please send me any modifications you make. Keep in mind that I'm likely to
# turn down obscure features to avoid including everything but the kitchen
# sink.

# This code is distributed under the GNU General Public License (GPL). See
# http://www.opensource.org/gpl-license.html and http://www.opensource.org/.

# Version History (major changes only)

# 4.4 Separated configuration from main script (yea!). Fixed "uninitialized
#     variable" warnings. Created user-submitted schemas webpage at
#     http://www.cs.virginia.edu/~dwc3q/code/schemas.html. Added more data
#     acquisition schemas.
# 4.3 Added Infoworld, changed date & time functions to take any valid
#     strftime format string, changed CoolSite to use their logo, changed to
#     only dump to screen if called as a cgi script, and added url attribute
#     to weather (all thanks to jbeimler@co-op.com). Added default values to
#     date, time, and OutputListOrColumns. Made NWS url a mandatory attribute
#     to weather.  Added Adam@Home comic.
# 4.2 Added Linux Today.  Modified GetText & GetLinks to take "^" and "\$"
#     signifying start and end of file. Added proxy support. (Thanks to Meng
#     Kuan <mkchew@mas.gov.sg>) Fork is now disabled on Windows platforms.
#     Added time tag (thanks to Nils Jeppe <nils@providence.work.de>). Moved
#     from HTML::Parse to HTML::Parser.
# 4.1 Added User Friendly and Freshmeat. Fixed slashdot to take a style.
#     Modified GetText to take "^" and "\$" signifying start and end of file.
#     Changed the script to use LWP's get instead of my GetUrl. (reuse!)
# 4   This is a major rewrite of DailyUpdate. I've been meaning to do this for
#     a while, but have had few users, and therefore very little external
#     pressure to do it.

################################################################################

require 5.001;
use strict;
# Used to parse the template file
use HTML::Parser;
# Used to make relative URLs absolute
use URI::URL;

package Main;

# Debug mode: doesn't put a time limit on the script, outputs some
# <!--DEBUG:...--> commentary, and doesn't write to the output file (instead
# it dumps to screen).
use constant DEBUG => 0;

require "DailyUpdate_config";

use vars qw( *OLDOUT $proxy $inHtml $outHtml $scriptTimeout );

if (DEBUG)
{
  $inHtml = "template.html";
  $outHtml = "daily.html";
}

# Need to derive a parser class from HTML::Parser
package dailyUpdateParser;

use vars qw( @ISA $socketTimeout $proxy %tagToHandler );
@ISA = qw(HTML::Parser);

# For UserAgent
use LWP::UserAgent;
# For timelocal
use Time::Local;
# For easy date/time manipulation
use POSIX;

# DEBUG for this package is the same as the main.
use constant DEBUG => Main::DEBUG;

# ------------------------------------------------------------------------------

# Checks the old html file to see if we need to change the information.  If
# the file doesn't exist, the time tag can't be found, the information is old,
# or the information couldn't be gotten on the last attempt, the function
# returns 1.  Otherwise, it outputs the old information and returns 0.
# Parameters: (Comment string, update times)
sub CachedDataUsable
{
my ($tagName, @updateTimes) = @_;

print "<!--DEBUG: CachedDataUsable: checking $tagName-->\n" if (DEBUG);

my $currentTime = time;
my ($hour,$day,$month,$year,$wday) = (localtime($currentTime))[2..6];

# If there's no old data file, we have to generate the data.
print "<!--DEBUG:  Old output file doesn't exist.-->\n" if (DEBUG && !-e $Main::outHtml);
return 0 if (!-e $Main::outHtml);

open (OLDHTML,$Main::outHtml) or die "Can't open output file!\n";
my $oldHtml = join "",<OLDHTML>;
close (OLDHTML);

my $lastUpdated;

# Need to get data if we can't find the embedded timestamp
if ($oldHtml !~ /<!-- $tagName (\d+) -->/)
{
  print "<!--DEBUG:  Can't find embedded timestamp for $tagName.-->\n" if (DEBUG);
  return 0;
}
else
{
  $lastUpdated = $1;
}

# Need to get the data if we were unsuccessful last time
if ($oldHtml =~ /<!-- $tagName (\d+) -->\nCouldn't get/)
{
  print "<!--DEBUG:  Data was not grabbed successfully last time.-->\n" if (DEBUG);
  return 0;
}

print "<!--DEBUG:  Last updated:$lastUpdated Current time:$currentTime.-->\n" if (DEBUG);
# Need to get the data if the old data is stale
my $needToUpdate = 0;
foreach my $updateHour (@updateTimes)
{
  my $updateTime = timelocal(0,0,$updateHour,$day,$month,$year);
  if (($lastUpdated < $updateTime) && ($updateTime < $currentTime))
  {
    print "<!--DEBUG:  Data is stale for time: $updateTime.-->\n" if (DEBUG);
    return 0;
    last;
  }
}

# Otherwise, just print out the old data
print "<!--DEBUG: Re-using current data.-->\n" if (DEBUG);
my ($oldData) = $oldHtml =~ /(<!-- $tagName \d+ -->\n.*?<!-- $tagName \d+ -->)/s;
print "$oldData\n";

return 1;
}

# ------------------------------------------------------------------------------

# Extracts any normal text from the html at url, between startPattern and
# endPattern. Removes beginning and ending newlines, and collapses multiple
# blank lines between paragraphs into a single blank line.

# This function returns a list to be compatible with GetLinks in the context
# of HandleGeneric. startPattern and endPattern can be "^" or "\$" to match
# the beginning of the file or the end of the file.
sub GetText
{
my ($url,$startPattern,$endPattern) = @_;

my $userAgent = new LWP::UserAgent;
$userAgent->timeout($socketTimeout);
$userAgent->proxy(['http', 'ftp'], $proxy) if $proxy ne "";
my $request = new HTTP::Request GET => "$url";
my $html = $userAgent->request($request)->content;

return () if ((!defined $html) || ($html eq ""));

# Strip off all the stuff before and after the start and end patterns
$html =~ s/.*?$startPattern(.*?)$endPattern.*/$1/s
  if (($startPattern ne "^") && ($endPattern ne "\$"));
$html =~ s/.*?$startPattern(.*)/$1/s
  if (($startPattern ne "^") && ($endPattern eq "\$"));
$html =~ s/(.*?)$endPattern.*/$1/s
  if (($startPattern eq "^") && ($endPattern ne "\$"));

# Remove all tags
$html =~ s/<.*?>//gs;

# Remove all fragments of tags at the beginning
$html =~ s/^.*>//s;

# Remove all fragments of tags at the end
$html =~ s/<.*//s;

# Remove starting and ending newlines
$html =~ s/^\n*(.*?)\n*$/$1/s;

# Remove extra newlines between paragraphs
$html =~ s/\n\n\n/\n\n/sg;

return ($html);
}

#-------------------------------------------------------------------------------

# Extracts all <a href...>...</a> links at a given url between startPattern
# and endPattern. Removes all text formatting, and makes relative links
# absolute. Puts quotes around attribute values in stuff like <a href=blah> and
# <img src=blah>.

# Now handles "^" and "\$" to signify start and end of file.
sub GetLinks
{
my ($url,$startPattern,$endPattern) = @_;

my $userAgent = new LWP::UserAgent;
$userAgent->timeout($socketTimeout);
$userAgent->proxy(['http', 'ftp'], $proxy) if $proxy ne "";
my $request = new HTTP::Request GET => "$url";
my $html = $userAgent->request($request)->content;

return () if ((!defined $html) || ($html eq ""));

# Strip off all the stuff before and after the start and end patterns
$html =~ s/.*?$startPattern(.*?)$endPattern.*/$1/s
  if (($startPattern ne "^") && ($endPattern ne "\$"));
$html =~ s/.*?$startPattern(.*)/$1/s
  if (($startPattern ne "^") && ($endPattern eq "\$"));
$html =~ s/(.*?)$endPattern.*/$1/s
  if (($startPattern eq "^") && ($endPattern ne "\$"));

# Strip off all the stuff before and after the start and end patterns
$html =~ s/.*?$startPattern(.*?)$endPattern.*/$1/s;

# Figure out how to make relative links absolute.
my $baseurl;
if ($url =~ m#(http://.*?)/# || $url =~ m#(http://.*?)#)
{
  $baseurl = $1;
}

$baseurl = "$baseurl/" if $baseurl !~ m#/$#;

my @links;

# See if there's a link on this line
while ($html =~ /(<a href.*?>.*?<\/a>)/sgci)
{
  my $link = $1;

  # Remove any formatting
  $link =~ s/< *\/?(font|li|b).*?>//sig;

  # Put quotes around "a href" and "src" if they don't have it.
  $link =~ s/<((?:a href|src)[^>=]+?=)((?!")[^\s>]+)(?!")([^>]*)>/<$1"$2"$3>"/gi;

  # change relative tags to absolute
  $link =~ s/(<[^=]+?=")([^"]+)("[^>]*>)/sprintf("$1%s$3",URI::URL->new($2,$baseurl)->abs)/egi;

  push @links,$link;
}

return @links;
}

#-------------------------------------------------------------------------------

# Formats the items in the argument array as an unformatted list
sub PrintUnorderedList
{
  print "<ul>\n";
  while (my $item = shift)
  {
    print "  <li> $item\n";
  }
  print "</ul>\n";
}

#-------------------------------------------------------------------------------

# Formats the items in the argument array as two columns
sub PrintTwoColumns
{
  my @items = @_;

  print "<table width=100%>\n";

  my $secondColumn = $#items;

  print ("<tr>\n  <td width=50% valign=top>\n");


  for (my $index=0;$index < int($#items/2)+1;$index++)
  {
    print ("    $items[$index]<br>\n");
  }

  print "  </td>\n  <td valign=top>\n";

  for (my $index=int($#items/2)+1;$index <= $#items;$index++)
  {
    print ("    $items[$index]<br>\n");
  }

  print ("  </td>\n</tr>\n");
  print "</table>\n";
}

#-------------------------------------------------------------------------------

# Prints the data as an unordered list or two equal length columns, depending
# on $attributes->{style}
sub OutputListOrColumns
{
my $attributes = shift;
my $tagName = shift;
my @data = @_;

# Unordered list is the default
if ((!defined $attributes->{style})
    || ($attributes->{style} =~ /^unorderedlist$/i))
{
  &PrintUnorderedList(@data);
}
elsif ($attributes->{style} =~ /^twocolumn$/i)
{
  &PrintTwoColumns(@data);
}
else
{
  print "Warning: Unknown style for $tagName.\n";
}
}

#-------------------------------------------------------------------------------

# This is a generic handler for most data sources.

# The parameters are:
# - the hours of the day at which to update the data
# - the url from which to get the data
# - the name of the data (used in error messages and output comments)
# - the function to use to get the data (GetLinks and GetText are good
#   examples so far. See their comments for more info.)
# - the code to use to filter the acquired data
# - the code to use to output the data
sub HandleGeneric
{
my $attributes = shift;
my $updateTimes = shift;
my $url = shift;
my $tagName = shift;
my $getFunction = shift;
my $filterCode = shift;
my $outputCode = shift;

return if &CachedDataUsable($tagName,@$updateTimes);

my $time = time;
print "<!-- $tagName ".$time." -->\n";

# Get the data
my @grabbedData = eval $getFunction;
print "<!--DEBUG: Acquired ".($#grabbedData+1)." lines of data.-->\n" if (DEBUG);

# Filter the data
eval $filterCode;

if ($#grabbedData == -1)
{
  # Don't change "Couldn't get". It's used by CachedDataUsable
  print "Couldn't get $tagName.\n";
  print "<!-- $tagName ".$time." -->\n";
  return;
};

# Output the data
eval $outputCode;

print "<!-- $tagName ".$time." -->\n";
return;
}

#-------------------------------------------------------------------------------

# Output the time in the following formats, depending on the parameter
# passed:
# style=twentyfour: 14:17:59
# style=twelve: 2:17:59 PM (default)
sub HandleTime
{
my $attribs = shift;

my $time;

# Twelve hour is the default
if ((!defined $attribs->{style}) || ($attribs->{style} =~ /^twelve$/i))
{
  $time = POSIX::strftime("%l:%M:%S %p", localtime);
}
elsif ($attribs->{style} =~ /^twentyfour$/i)
{
  $time = POSIX::strftime("%k:%M:%S", localtime);
}

# If the time is empty, or contains a % (from the strftime) or if it is the
# exact same as the format string, most likely its not what we wanted
if ((!$time) || ($time =~ m/\%/)
   || ((defined $attribs->{style} && $time =~ m/^($attribs->{style})$/))) 
{
  print "Warning: Unknown style for time.\n";
}
else
{
  print $time;
}

}

#-------------------------------------------------------------------------------

# Output the date in the following formats, depending on the parameter passed:
# style=day: Tuesday, November 7 (default)
# style=numeric: 951107
# style=long: November 7, 1995
# style=Anything else defined by the function strftime (see man strftime 3)

sub HandleDate
{
my $attribs = shift;

my ($date,@datearray);

if ((!defined $attribs->{style}) || ($attribs->{style} =~ /^day$/i))
{
  $date = POSIX::strftime("%A, %B %d", localtime);
}
elsif ($attribs->{style} =~ /^numeric$/i)
{
  $date = POSIX::strftime("%D", localtime);
  @datearray  = split(/\//,$date);
  $date = "$datearray[2]$datearray[0]$datearray[1]";
}
elsif ($attribs->{style} =~ /^long$/i)
{
  $date = POSIX::strftime("%B %d, %Y", localtime);
}
else
{
  $date = POSIX::strftime($attribs->{style}, localtime);
}

# If the date is empty, or contains a % (from the strftime) or if it is the
# exact same as the format string, most likely its not what we wanted.
if ((!$date) || ($date =~ m/\%/)
   || ((defined $attribs->{style}) && ($date =~ m/^($attribs->{style})$/))) 
{
  print "Warning: Unknown style for date.\n";
}
else
{
  print $date;
}
}

# ------------------------------------------------------------------------------

# Prints out the current weather given the right line from the NWS format.
sub FormatCurrentWeather
{
my $currentWeather = shift;

print "<tr>\n <th> Currently </th>\n";

($currentWeather) = $currentWeather =~ /^(  .*)/m;

# Pretty up N/As
$currentWeather =~ s/\bN\/A\b/Not Available/g;

# Print the last part, which should be the skies.
print " <td> \u\L$1 </td>\n" if ($currentWeather =~ /((\w|\w \w)+)$/);

# Print the first part, which should be the temperature.
print " <td> ",$1,"&#176; </td>\n" if ($currentWeather =~ /^ *(\d+)(?! percent)/);

# Print the third part, which should be the winds and rain.
print " <td> \u\L$1</td>\n" if ($currentWeather =~ /^ +\d+   +\S+   +((\w|\w \w)+)  +/);

print "</tr>\n";
}

# ------------------------------------------------------------------------------

# Prints out the weather forecast given the right paragraph from the NWS
# format.
sub FormatWeatherForecast
{
my $weatherText = shift;

# Identify the days
my @days = split(/^ /m, $weatherText);

# Get rid of the bogus empty day
shift @days;

# Get rid of newlines
grep {chomp;s/\n/ /g} @days;

foreach my $day (@days)
{
  print "<tr>\n";

  $day =~ s/^(.*?)\.\.\.//;
  my $time = $1;
  print " <th> \u\L$time </th>\n";

  # Change multiple periods to single ones
  $day =~ s/\.\.*/\./g;

  # Remove spaces before periods
  $day =~ s/\s+\./\./g;

  my @sentences = split(/\.\s*/,$day);

  # Print skies
  print " <td>";
  if ($sentences[0] =~ /(HIGH|LOW).*\d\d/)
  {
    print " &nbsp;";
  }
  else
  {
    while (($#sentences != -1) && ($sentences[0] !~ /(HIGH|LOW).*\d\d/))
    {
      my $sentence = shift @sentences;
      print " \u\L$sentence.";
    }
  }
  print " </td>\n";

  # Print temperatures
  print " <td>";
  if ($sentences[0] !~ /(HIGH|LOW).*\d\d/)
  {
    print " &nbsp;";
  }
  else
  {
    while (($#sentences != -1) && ($sentences[0] =~ /(HIGH|LOW).*\d\d/))
    {
      my $sentence = shift @sentences;
      print " \u\L$sentence.";
    }
  }
  print " </td>\n";

  # Print winds and rain
  print " <td>";
  if ($#sentences == -1)
  {
    print " &nbsp;";
  }
  else
  {
    while ($#sentences != -1)
    {
      my $sentence = shift @sentences;
      print " \u\L$sentence.";
    }
  }
  print " </td>\n</tr>\n";

}
}

# ------------------------------------------------------------------------------

# Gets the weather forecast from the NWS
sub HandleWeather
{
my $attributes = shift;
my $updateTimes = shift;
my $tagName = shift;

if (!$attributes->{url})
{
  print "URL attribute must be defined for weather tag\n";
  return;
}

return if &CachedDataUsable($tagName,@$updateTimes);

my $time = time;
print "<!-- $tagName ".$time." -->\n";

my $userAgent = new LWP::UserAgent;
$userAgent->timeout($socketTimeout);
$userAgent->proxy(['http', 'ftp'], $proxy) if $proxy ne "";
my $request = new HTTP::Request GET => $attributes->{url};
my $html = $userAgent->request($request)->content;

return if ((!defined $html) || ($html eq ""));

my ($currentWeather) = $html =~ /=====\n(.*?)\n/s;
my ($forecast) = $html =~ /\n (\S.*?)\n\s*\n/s;

if (($forecast eq "") || ($currentWeather eq ""))
{
  # Don't change "Couldn't get". It's used by CachedDataUsable
  print "Couldn't get $tagName.\n";
  print "<!-- $tagName ".$time." -->\n";
  return;
};

  # Print the table headers.
  print <<EOT;
<table width=100%>
<tr>
 <th></th>
 <th> Skies </th>
 <th> Temperature </th>
 <th> Winds & Rain </th>
</tr>
EOT

&FormatCurrentWeather($currentWeather);
&FormatWeatherForecast($forecast);
print "</table>\n";

print "<!-- $tagName ".$time." -->\n";
return;
}

# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------

# This is the parser code. Basically, everything gets passed through except
# certain tags, which are caught by the overridden "start" function.

# Basically pass everything through except the special tags.
sub text { print "$_[1]"; }
sub declaration { print "<!$_[1]>"; }
sub comment { print "<!--$_[1]-->"; }
sub end { print "</$_[1]>"; }

sub start
{
  my $self = shift @_;
  my $originalText = pop @_;
  my ($tag, $attributeList) = @_;

  # If we see a special tag, run the proper function for it.
  my $recognizedTag = 0;
  foreach my $jump (keys %tagToHandler)
  {
    if ($tag =~ /^$jump$/i)
    {
      my $handler = ${$tagToHandler{$jump}}[0];
      # "&{$handler}" is a function call to the function reference $handler.
      # The arguments are the attribute list, and the list of data in
      # $tagToHandler after the function. (That is, from array location 1 to
      # the end.)
      &{$handler}($attributeList,@{$tagToHandler{$jump}}[1..$#{$tagToHandler{$jump}}]);
      $recognizedTag = 1;
      last;
    }
  }

  # If it's not a special tag, just print it out.
  if ($recognizedTag == 0)
  {
    print "$originalText";
  }
}

# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------

# Here's the main program

package Main;

# Disable the timers if in debug mode
if ((!DEBUG) && ($^O ne "MSWin32"))
{
  if ((my $scriptpid = fork) != 0) 
  {
    if ((my $sleeppid = fork) != 0)   # Fork off the timer
    {
      # If Dailyupdate finishes before the timer goes off, kill the timer and
      # exit.
      waitpid ($scriptpid,0);
      kill 9,$sleeppid;
    }
    else 
    {
      # If the timer goes off before Dailyupdate finishes, kill Dailyupdate
      # (which will also kill the "Waiter" process) and exit.
      sleep $scriptTimeout;
      kill 9,$scriptpid;
    }
    exit (1);
  }
}

# Make unbuffered for easier debugging.
$| = 1 if (DEBUG);

# Store the old STDOUT so we can replace it later.
open (OLDOUT, ">&STDOUT") if (!DEBUG);

# Redirect STDOUT to a temp file.
open (STDOUT,">".$outHtml.".temp") if (!DEBUG);

# Okay, now do the magic. Parse the input file, calling the handlers whenever
# a special tag is seen.
my $p = new dailyUpdateParser;
$p->parse_file($inHtml);

# Restore STDOUT to the way it was
if (!DEBUG)
{
  close (STDOUT);
  open(STDOUT, ">&OLDOUT") or die "Can't restore STDOUT.\n";

  # Replace the output file with the temp file.
  unlink $outHtml;
  rename ($outHtml.".temp",$outHtml);
  chmod 0755, $outHtml;

  # Check to see if we were invoked as a cgi script
  my $scriptname;
  $scriptname = $ENV{'SCRIPT_NAME'} or $scriptname = '';
  if ($scriptname)
  {
    # Now print the results becuase DailyUpdate was invoked as some sort of cgi.
    print "Content-type: text/html\n\n";

    open (INFILE, $outHtml);
    while (defined(my $line = <INFILE>)) {
      print $line;
    }
    close INFILE;
  }
}
