#!/usr/local/bin/perl
#
# $Header: /u/wjm/tmp/RegEx/src/NewsScan,v 1.0 93/08/21 12:00:09 wjm Exp 
# RegEx NewsScan - version 1.0
#$Log:	NewsScan,v $
# Revision 1.2  93/08/27  02:16:22  02:16:22  wjm (Bill Middleton)
# fixed a bug in the temporary storage, and
# also made hits mailable with .forward in
# subscriber directory.
# 
# 
# copyright (c) Bill Middleton 1993
#
#  Configure
#
# Directory into which incoming batches are linked while they wait.
$waitarea      = '/news/search.pl/waitingarea';
# Directory holding one sub-directory per subscriber.
$subscribe_dir = "/news/search.pl/subscribers";
# Scratch file that collects the hits (kept on disk to save memory).
$hits          = "/news/search.pl/bin/hits";
# Mail command prefix; the recipient address is appended when hits are sent.
$how_we_mail   = '/usr/local/bin/mush -s " RegEx Automated hits delivery"';
# File holding the pid of the running scan; its presence blocks a second run.
$pidfile       = '/news/search.pl/bin/pid';
# Load-average threshold handed to pmeter.
$MAXLOAD       = 3;
# Debug flags; bit value 2 is what gets passed on to pmeter.
$DEBUG         = 0;
#
# End Configure
#
# This pmeter stuff is taken right out of clip.

# Refuse to start while a pid file exists, then record our own pid in it.
(-f $pidfile) && die "Pidfile exists\n";
$pid = $$;
open(PID,">$pidfile") || die "can't write $pidfile: $!\n";
print PID "$pid";
close(PID) || die "can't close $pidfile: $!\n";

# Fork the load watchdog.  The child runs pmeter against our pid; the parent
# carries on with the scan.
$pmeter = fork;
unless (defined $pmeter) {
    $err = $!;
    unlink $pidfile;            # do not leave a stale pid file behind
    die "can't fork: $err";
}
if ($pmeter == 0) {
    &pmeter($MAXLOAD, $pid, $DEBUG & 2);
    # pmeter returns once the scanner process is gone; that is the normal
    # end of the watchdog, not an error.
    exit 0;
}


# Main scan: load the subscriber patterns, run every waiting batch through
# process_article, then distribute the collected hits.
#
# NOTE(review): /^7|0/ selects names that start with "7" OR contain a "0"
# anywhere (the "^" binds to the "7" only).  The filter is kept as it was;
# names that match neither branch of the dispatch below are now skipped
# instead of being deleted unprocessed.
if (opendir(WAIT,$waitarea)) {
    @batches=grep(/^7|0/,readdir(WAIT));
    closedir(WAIT);
} else {
    warn "cant read $waitarea: $!\n";
    @batches=();
}
unless (open(HITLIST,">$hits")) {
    $err = $!;
    unlink $pidfile;                    # do not block later runs
    die "cant open hits: $err\n";
}
select(HITLIST); $|=1; select(STDOUT);  # unbuffered, so hits reach the file at once

&get_hits();                                     # get user keywords
T1: foreach $batch (@batches){                   # process each batch
    unless (open(BATCH,"<$waitarea/$batch")) {
        warn "cant open $waitarea/$batch: $!\n";
        next T1;                        # leave the unread batch in place
    }
    if (($batch =~ /^7/) || ($batch =~ /\.Z\.t/)) {
        # Batch file: each article is preceded by "#! <command> <size> <site>".
        while (<BATCH>) {
            unless (/^#\!\s+(\S+)\s+(\d+)\s+(\S+)/) {
                # Not a batch header: give up on this file and keep it.
                close BATCH;
                next T1;
            }
            ($command,$size,$site) = ($1,$2,$3);
            $artlength=0;
            $art='';
            while ($artlength < $size) {
                $line = <BATCH>;
                # A truncated batch ends before <size> bytes were read; stop
                # here instead of spinning forever on end-of-file.
                last unless defined $line;
                $artlength+=length($line);
                $art.=$line;
            }
            &process_article($art);
        }
    } elsif ($batch =~ /^0\./) {        # processing single article
        $art='';
        while (<BATCH>) {
            $art.=$_;
        }
        &process_article($art);
    } else {
        # Unrecognised name: nothing was processed, so do not delete it.
        close BATCH;
        next T1;
    }
    close BATCH;
    unlink "$waitarea/$batch";
}
close HITLIST;
&process_hitlist();
unlink $hits;                           # remove the scratch hit file itself
unlink $pidfile;
############################################################################
# process_hitlist - deliver the collected hits to each subscriber.
#
# Reads the hit file $hits, whose lines are "<subscriber>\t<rest>", groups
# the <rest> parts per subscriber and then, for every subscriber that still
# has a non-empty .newsearchrc:
#   - pipes the text to "$how_we_mail <addresses>" when the subscriber
#     directory holds a usable .forward file, or
#   - appends it to the subscriber's .hits file otherwise.
# Takes no arguments and returns nothing useful.
#
# The .forward contents end up on a shell command line, so only plain
# address words are accepted; anything else (pipes, paths, quotes, shell
# metacharacters) makes the .forward unusable and the hits go to .hits.
sub process_hitlist{
local($who,$line,$fwd,$address,$target);
local(%hits2);
# one address word: safe characters only, and never a leading "-"
local($word) = '[\w.+\@%!=][\w.+\@%!=-]*';

unless (open(HITLIST,"<$hits")) {
  warn "cant read $hits: $!\n";
  return;
}
while (<HITLIST>){
  ($who,$line) = split(/\t/,$_,2);
  next unless defined $line;                 # no tab: not a hit line
  $hits2{$who} .= $line;
}
close HITLIST;

while(($who,$line) = each %hits2){           # write on each users .hits file
  next unless ((-s "$subscribe_dir/$who/.newsearchrc") > 0);
  $address = '';
  $fwd = "$subscribe_dir/$who/.forward";
  if (-f $fwd) {
    if (open(FORWARD,"<$fwd")) {
      $address = join(' ',<FORWARD>);
      close FORWARD;
    }
    # Flatten newlines and commas into single blanks so that several
    # addresses become plain arguments on one command line.
    $address =~ s/[\s,]+/ /g;
    $address =~ s/^ //;
    $address =~ s/ $//;
    unless ($address =~ /^${word}( ${word})*$/) {
      warn "ignoring unusable $fwd; appending to .hits instead\n";
      $address = '';
    }
  }
  if ($address ne '') {
    $target = "| $how_we_mail $address";
  } else {
    $target = ">> $subscribe_dir/$who/.hits";
  }
  unless (open(HITS,$target)) {
    warn "cant deliver hits for $who: $!\n";
    next;
  }
  print HITS $line;
  close HITS;
}
}
############################################################################
# process_article - match one article against every subscriber expression.
#
# $_[0] is the complete article text (header, blank line, body).
# For each key of %hits (an expression of the form
# "[not ]regex and [not ]regex ...") every term must be satisfied: a plain
# term has to match the article, a "not" term must not match.  Matching is
# case insensitive.  Articles whose Newsgroups line contains "binaries" are
# matched on the header only.  For every satisfied expression one line
#   subscriber \t message-id \t subject \t newsgroups \t expression \t from
# is written to HITLIST for each subscriber listed in the value of %hits.
#
# A term that is not a valid regular expression disables its expression
# (reported once on STDERR via %bad_exp) instead of aborting the whole scan.
sub process_article{
local($header,$body)=split(/\n\s*\n/,$_[0],2);      # split out the header
$header.="\n";
local($whole,$id,$subj,$ngps,$from,$hit,$val);
local(@mult,@who,$flag,$exp,$reverse,$matched);
$header=~s/\t/   /g;              # keep tabs out of the tab-separated hit line
# Header names are anchored at the start of a line so that, say, "From:"
# inside a Subject or an "X-Subject:" header is not picked up, and only
# blanks are skipped after the colon so an empty header cannot swallow the
# following line.
($header=~/(^|\n)Message-ID: *(.*)/i) && ($id = $2);
($header=~/(^|\n)Subject: *(.*)/i) && ($subj=$2);
($header=~/(^|\n)Newsgroups: *(.*)/i) && ($ngps=$2);
($header=~/(^|\n)From: *(.*)/i) && ($from=$2);
($ngps=~/binaries/) ? ( $whole=$header) : ($whole= $header.$body);
while (($hit,$val) = each %hits){
  @mult=split(/\s+and\s+/,$hit);
  $flag=0;
  foreach $exp (@mult){                             # for each 'and' exp
    $reverse = ($exp =~ s/^not\s+//) ? 1 : 0;
    # A broken pattern raises a run-time error; trap it here.
    $matched = eval { ($whole =~ /$exp/i) ? 1 : 0; };   # case insensitive
    unless (defined $matched){
      warn "bad expression '$exp' in '$hit': $@" unless $bad_exp{$hit}++;
      $flag=0; last;
    }
    if ($matched != $reverse){      # plain term matched, or "not" term did not
      $flag=1;
    }else{
      $flag=0; last;
    }
  }
  if($flag){
    @who=split('\n',$val);
    for (@who){
      print HITLIST "$_\t$id\t$subj\t$ngps\t$hit\t$from\n";
    }
  }
}
}

############################################################################
# get_hits - load every subscriber's search expressions into %hits.
#
# Each entry of $subscribe_dir (other than "." and "..") names a subscriber.
# A subscriber with a non-empty, readable .newsearchrc contributes its
# expressions: lines starting with "#" and blank lines are ignored, and
# reading stops once 11 lines (of any kind) have been consumed.
# %hits maps expression => newline-terminated list of subscriber names.
# Takes no arguments and returns nothing useful.
sub get_hits{
local(@subscribers,$sub,$i);
if (opendir(SUBS,$subscribe_dir)) {
    @subscribers=grep(!/^\.\.?$/,readdir(SUBS));
    closedir(SUBS);
} else {
    warn "cant read $subscribe_dir: $!\n";
}
foreach $sub (@subscribers){
    $i=0;
    next unless ((-s"$subscribe_dir/$sub/.newsearchrc") > 0);
    open(SEARCHRC,"<$subscribe_dir/$sub/.newsearchrc") || next;
    while(<SEARCHRC>) {
        last if ($i++ > 10);
        # Strip only a real line terminator: a final line without a newline
        # must keep its last character.
        s/\n$//;
        next if /^#/;
        next if /^\s*$/;
        $hits{$_} .= "$sub\n";
    }
    close(SEARCHRC);
}
}

# pmeter - load watchdog for another process.
#
#   $loadavg  load-average threshold
#   $pid      process to supervise
#   $debug    when true, report stop/start transitions on STDERR
#
# While $pid exists, the first load figure printed by `uptime` is checked
# every 90 seconds: above the threshold the process is sent STOP, otherwise
# CONT.  $0 is updated to show the current state.  Returns when $pid no
# longer exists; dies if the uptime output cannot be parsed.
sub pmeter {
    # ($loadavg, $pid, $debug) = @ARGV;
    local($loadavg, $pid, $debug) = @_;
    local($running, $load, $err) = (1, 0, '');
    $0 = "pmeter @_";

    while (kill 0, $pid) {
        # Some systems print "load average:", others "load averages:".
        unless (`uptime` =~ /load averages?:\s+([\d.]+)/) {
            $err = $!;
            # Wake the supervised process first: once this watchdog is gone
            # nobody else would ever send it CONT.
            kill 'CONT', $pid;
            die "Can't run uptime: $err\n";
        }
        $load = $1;                 # keep the figure; $1 is easily clobbered

        if ($load > $loadavg) {
            kill 'STOP', $pid;
            if ($running) {
                print STDERR "stopping at $load\r\n" if $debug;
                $0 = "pmeter (stopped $pid at $loadavg)";
                $running = 0;
            }
        }
        else {
            kill 'CONT', $pid;
            if (!$running) {
                print STDERR "starting at $load\r\n" if $debug;
                $0 = "pmeter (started $pid at $loadavg)";
                $running = 1;
            }
        }
        sleep 90;
    }
}
