#!/usr/bin/perl
# mirror.openwebmail.pl - mirror openwebmail with wget.
#
# 2004/07/15 tung@turtle.ee.ncku.edu.tw
#
# This script uses wget to mirror a web URL and
# deletes local files that no longer exist on the remote site.
# The stale files are found by comparing the local file list with the wget log.

use strict;

# URL of the tree to mirror; it must end with a trailing /
my $mirror_url = "http://turtle.ee.ncku.edu.tw/openwebmail/";

# Remote paths that should not be mirrored; each must end with a trailing /
# eg: my @excludelist = (
#        "/openwebmail/download/old/",
#        "/openwebmail/download/redhat/rpm/daily-build/SRPMS/"
#     );
#
my @excludelist = ();

# Local directory that receives the mirror
my $downloaddir = "/home/ftp/pub/openwebmail/";

# Set $delete_nonexist to 0 to keep local files
# even if they no longer exist on the remote site
my $delete_nonexist = 1;

# While $delete_nonexist is 1,
# the local directories and files listed here are never deleted
# eg: my @keeplist = (
#        "/home/ftp/pub/openwebmail/dir1/",
#        "/home/ftp/pub/openwebmail/file1",
#        "/home/ftp/pub/openwebmail/file2"
#     );
#
my @keeplist = ();

# Path of the wget executable
my $wgetbin = "/usr/local/bin/wget";

# Proxy used by wget, empty for none
my $wgetproxy = "";
#my $wgetproxy = "http://someproxy.somedomain:3128/";

# Where the log files are written
my $wgetlog = "/var/log/mirror.openwebmail.log";
my $dellog  = "/var/log/mirror.openwebmail.del.log";

############## No further configuration is required from here on ##############

# Sanity-check the configuration before doing any work.
# Each failure prints a message and exits with its own status code.

# Only the first whitespace-delimited word of $wgetbin is tested for
# executability, so the setting may be followed by extra words.
{
   my $wget_program=(split(/\s/, $wgetbin))[0];
   unless (-x $wget_program) {
      print "Wget program $wget_program not found!\n";
      exit 1;
   }
}

# Only plain http:// URLs are accepted.
unless ($mirror_url =~ m!^http://!) {
   print "Invalid mirror url $mirror_url!\n";
   exit 2;
}

# Drop trailing slashes; later code builds "$downloaddir/" prefixes itself.
$downloaddir =~ s!/+$!!;
unless (-d $downloaddir) {
   print "Directory $downloaddir doesn't exist, mirror is canceled\n";
   exit 3;
}


# Derive the wget parameters from the configuration:
#   $cutdir_number - number of path components in the mirror url (--cut-dirs)
#   $remotedir     - remote path of the mirrored tree, with a leading /
#   $excludeparm   - the --exclude-directories option, "" if nothing is excluded
my ($cutdir_number, $remotedir, $excludeparm);
{
   (my $hostpath=$mirror_url) =~ s!^http://!!;
   $hostpath =~ s!/$!!;

   # first element is the hostname, the rest are the path components
   my @url_parts=split(/\//, $hostpath);
   $cutdir_number=$#url_parts;
   shift @url_parts;	# drop the hostname, keep only the path
   $remotedir="/".join("/", @url_parts);

   my $excluded=join(",", @excludelist);
   $excludeparm=($excluded ne "") ? "--exclude-directories=".$excluded : $excluded;
}


# Rewrite the members of @excludelist in place from absolute remote paths
# into paths relative to the mirrored tree; a member that is not located
# under $remotedir aborts the script with status 4.
{
   # $remotedir is "/" when the site root is mirrored, so only add the
   # separating slash when it is missing instead of producing "//"
   my $remote_prefix=$remotedir;
   $remote_prefix.='/' if ($remote_prefix!~m!/$!);

   foreach my $exclude (@excludelist) {
      # \Q..\E: the prefix is a literal path, not a pattern
      if ($exclude!~s!^\Q$remote_prefix\E!!) {
         print "excludelist member $exclude is not under $remotedir\n";
         exit 4;
      }
   }
}

# Rewrite the members of @keeplist in place into paths relative to
# $downloaddir; existing directories get a trailing / appended first.
# A member that is not located under $downloaddir aborts with status 5.
foreach my $keep (@keeplist) {
   $keep.='/' if (-d $keep && $keep!~m!/$!);
   if ($keep!~s!^\Q$downloaddir\E/!!) {
      print "keeplist member $keep is not under $downloaddir\n";
      exit 5;
   }
}

############## most variables have been initialized before here ##############

# wget is started without a target directory option, so it writes the
# mirror into the current directory. Refuse to continue if we can't get
# into $downloaddir, otherwise the mirror would land somewhere else.
if (!chdir($downloaddir)) {
   print "Can't change directory to $downloaddir ($!), mirror is canceled\n";
   exit 6;
}

# start with fresh log files
unlink($wgetlog, $dellog);

# Run the mirror; wget writes its report to $wgetlog (-o).
# $excludeparm may be empty, and --cut-dirs strips the remote path
# components so files are stored directly below the current directory.
$ENV{'http_proxy'}=$wgetproxy;
`$wgetbin $excludeparm --cache=off --reject=O=A,D=A,M=A,N=A,S=A,O=D,D=D,M=D,N=D,S=D --cut-dirs=$cutdir_number -m -nH -np -o $wgetlog $mirror_url`;

# Scan the wget log (only needed when stale files are to be deleted):
#   %exist           - names that appear in "=> `name'" log lines
#   $exist_count     - number of such lines
#   $error_timeout   - set if a timeout / host down message was logged
#   $error_noservice - set if a "Service Unavailable" message was logged
my (%exist, $exist_count, $error_timeout, $error_noservice);
$exist_count=0;
if ($delete_nonexist) {
   # 3-arg open so the configured path is always treated as a plain file name
   my $log_fh;
   if (!open($log_fh, '<', $wgetlog)) {
      print "wget logfile open error ($!)\n";
      exit 4;
   }
   while (my $line=<$log_fh>) {
      if ($line=~/=> `(.*)'/) {
         $exist{$1}=1;
         $exist_count++;
      } elsif ($line=~/Operation timed out/ || $line=~/Host is down/) {
         $error_timeout=1;
      } elsif ($line=~/Service Unavailable/) {
         $error_noservice=1;
      }
   }
   close($log_fh);
}

# Open the delete log. Files are about to be removed, so don't go on
# without a place to record what was deleted.
if (!open(DELLOG, '>', $dellog)) {
   print "delete logfile open error ($!)\n";
   exit 7;
}

# Turn deletion off when the wget log can't be trusted: too few files
# were recognized in it, or it contains timeout / service errors.
# Each reason is written on its own line of the delete log.
if ($delete_nonexist) {
   if ($exist_count<5) {
      $delete_nonexist=0;
      print DELLOG "### Unknown wget log format? exist_count<5, delete_nonexist canceled.\n";
   } elsif ($error_timeout) {
      $delete_nonexist=0;
      print DELLOG "### Operation timeout? delete_nonexist canceled.\n";
   } elsif ($error_noservice) {
      $delete_nonexist=0;
      print DELLOG "### Service unavailable? delete_nonexist canceled.\n";
   }
}

# del faked index.html and files that don't appear in wgetlog
#
# Every regular file below $downloaddir is examined. Faked index files are
# always removed; other files are removed only while $delete_nonexist is
# set, they are missing from %exist and they match neither @excludelist
# nor @keeplist. Each successful removal is recorded in the delete log.
#
# find is started in list form (no shell), so unusual characters in
# $downloaddir can't be misinterpreted.
if (open(my $find_fh, '-|', 'find', $downloaddir, '-type', 'f')) {
   FILE: while (my $path=<$find_fh>) {
      chomp($path);

      # name relative to $downloaddir: the form in which %exist,
      # @excludelist and @keeplist are keyed
      (my $rel=$path)=~s!^\Q$downloaddir\E/!!;

      # leave this script and its log files alone; \Q..\E because a file
      # name is literal text, not a pattern
      next FILE if ($0=~/\Q$rel\E$/);
      next FILE if ($wgetlog=~/\Q$rel\E$/);
      next FILE if ($dellog=~/\Q$rel\E$/);

      # index files saved under a "?X=Y" query name
      if ($rel=~/index\.html\?[CODMNS]=[ADMNS]/) {
         print DELLOG "del $rel (filelist)\n" if (unlink($path));
         next FILE;
      }

      # index.html whose first 512 bytes contain the sort links of a
      # generated directory listing
      if ($rel=~/index\.html$/) {
         my $buff='';
         # 3-arg open: the name originates from the remote site and must
         # never be interpreted as an open mode or a command
         if (open(my $index_fh, '<', $path)) {
            read($index_fh, $buff, 512);
            close($index_fh);
         }
         if ($buff=~/\QA HREF="?N=D"\E/ ||
             $buff=~/\QA HREF="?O=D"\E/ ) {
            print DELLOG "del $rel (filelist)\n" if (unlink($path));
            next FILE;
         }
      }

      # files below an excluded remote path were not mirrored on purpose
      next FILE if (grep { index($rel, $_)==0 } @excludelist);

      if ($delete_nonexist && !$exist{$rel}) {
         my $keep_found=grep { index($rel, $_)==0 } @keeplist;
         print DELLOG "del $rel\n" if (!$keep_found && unlink($path));
      }
   }
   close($find_fh);
} else {
   print DELLOG "### Can't run find on $downloaddir ($!), file cleanup skipped.\n";
}

# remove empty dir
#
# rmdir() is tried on every directory below $downloaddir that is not
# covered by @excludelist or @keeplist; it only succeeds on empty ones.
# find is started in list form (no shell) and its output is sorted here.
if (open(my $find_fh, '-|', 'find', $downloaddir, '-type', 'd')) {
   my @dirs=<$find_fh>;
   close($find_fh);
   chomp(@dirs);

   # reverse order lists every directory before its parent, so a parent
   # emptied by removing its children can be removed in the same run
   DIR: foreach my $dir (reverse sort @dirs) {
      next DIR if ($dir eq $downloaddir);

      # Directory entries in the lists end with "/" while find prints
      # directories without it; compare against "$dir/" so that a listed
      # directory itself is protected, not only what is below it.
      my $name_found=grep { index("$dir/", "$downloaddir/$_")==0 }
                          (@excludelist, @keeplist);
      next DIR if ($name_found);

      print DELLOG "rmdir $dir (empty dir)\n" if (rmdir($dir));
   }
} else {
   print DELLOG "### Can't run find on $downloaddir ($!), empty dir cleanup skipped.\n";
}

close(DELLOG);
