#!/usr/bin/perl -w
use strict 'refs';
use CGI qw(:standard);
use CGI::Carp qw(warningsToBrowser fatalsToBrowser);

# Turn on autoflush for the currently selected output handle (STDOUT).
$| = 1;

# Explicitly leave CGI.pm's upload support switched on; the compare form
# submits a bibtex file.
$CGI::DISABLE_UPLOADS = 0;

# Installation directory; the code below reads and writes its index/, bibs/
# and tmp/ subdirectories and runs helper scripts stored in it.
my $ROOT = '/home/hal/public_html/WhatToSee';

# Timestamp of this request, used when naming the archived copy of an upload.
my $DATE = getdate();

print header();

# Client-side helpers wired to the "Select all" / "Select none" buttons.
my $JAVASCRIPT = <<"ENDJAVA";
function selectAll(formObj) 
{
   for (var i=0;i < formObj.length;i++) 
   {
      fldObj = formObj.elements[i];
      if (fldObj.type == 'checkbox')
      { 
         fldObj.checked = true; 
       }
   }
}
function selectNone(formObj) 
{
   for (var i=0;i < formObj.length;i++) 
   {
      fldObj = formObj.elements[i];
      if (fldObj.type == 'checkbox')
      { 
         fldObj.checked = false; 
       }
   }
}
ENDJAVA
print start_html(-title => 'WhatToSee', -script => $JAVASCRIPT);

# Which page/action was requested; undefined or empty means the front page.
my $task = param('task');

print "<h2>WhatToSee</h2>\n";

# ---------------------------------------------------------------------------
# Task dispatch.  The 'task' request parameter selects what is rendered:
#   (none)      front page: explanation, compare form, new-index form
#   viewindex   list the papers recorded in one index file
#   schedindex  run ./schedule.pl over one index file and show its output
#   compare     score an uploaded bibtex file against the selected indices
#   newindex    crawl a URL and build a new index (see indexit)
#
# Every value that originates from the request or from an index file is
# HTML-escaped before being printed, and client-supplied index identifiers
# are resolved through index_path_for() so they can only name files that
# live directly inside $ROOT/index.
# ---------------------------------------------------------------------------
if ((not defined $task) || ($task eq '')) {
print<<ENDBEGIN;

I have a routine problem that sometimes paper titles are not enough to tell me what papers to read in recent conferences, and I often do not have time to read abstracts fully.  This collection of scripts is designed to help alleviate the problem.  Essentially, what it will do is compare what papers <i>you</i> like to cite with what new papers are citing.  High overlap means the paper is probably relevant to you.  Sure there are counter-examples, but overall I have found it useful (eg., it has suggested papers to me that are interesting that I would otherwise have missed).  Of course, you should also read through titles since that is a somewhat orthogonal source of information.<p/>

Here is how to use the system.  You upload your personal bibtex file and have the system compare it to a known conference index; it will then present a list of papers, sorted by relevance.  If you want to compare to a conference that is not yet indexed, you need to request that indexing take place.  This takes about 30 seconds per paper, so you will probably have to be patient.</p>
ENDBEGIN

    print "<h2>Compare to Known Index</h2>\n";
    print start_multipart_form();
    print "Path to your bibfile: ";
    # -override keeps CGI.pm's sticky fields from replacing the value with an
    # empty 'task' parameter taken from the current request.
    print hidden(-name => 'task', -default => 'compare', -override => 1);
    print filefield('bibfile','',50);

    print "<br/>Select indices:<br/>";

    # Collect the usable indices as conference => year => { URL, IDX }.
    # Line 1 of an index file is the conference URL, line 2 its name
    # ("CONF YEAR"); files with fewer than 10 lines are treated as
    # incomplete (e.g. an index that is still being built) and skipped.
    my %urls = ();
    foreach my $indexfile (glob("$ROOT/index/*")) {
        next unless -f $indexfile;
        open(my $in, '<', $indexfile) or die "cannot read index $indexfile ($!)";
        my $url = '';
        my $name = '';
        my $numlines = 0;
        while (my $line = <$in>) {
            chomp $line;
            if ($numlines == 0) { $url = $line; }
            if ($numlines == 1) { $name = $line; }
            $numlines++;
        }
        close $in;
        if (($url eq '') || ($numlines < 10)) { next; }

        my ($conf, $year) = ($name, '');
        if ($name =~ /^([^\s]+)[\s]+(.+)$/) {
            ($conf, $year) = ($1, $2);
        }
        $urls{$conf}{$year}{URL} = $url;
        $urls{$conf}{$year}{IDX} = $indexfile;
    }

    # One table row per conference, one checkbox cell per year.
    print "<table border=0 cellpadding=0 cellspacing=0>\n";
    foreach my $conf (sort keys %urls) {
        print "<tr>";
        print "<td>&nbsp;&nbsp;&nbsp;&nbsp;</td><td>" . CGI::escapeHTML($conf) . ":</td>";
        foreach my $year (sort keys %{$urls{$conf}}) {
            # Only the file name inside $ROOT/index is sent to the client.
            my $idxF = $urls{$conf}{$year}{IDX};
            $idxF =~ s/^\Q$ROOT\E\/index\///;

            # Percent-encode the name for use inside the query string.
            my $idxQuery = $idxF;
            $idxQuery =~ s/([^A-Za-z0-9._~-])/sprintf('%%%02X', ord($1))/ge;

            my $urlHtml  = CGI::escapeHTML($urls{$conf}{$year}{URL});
            my $yearHtml = CGI::escapeHTML($year);
            my $idxHtml  = CGI::escapeHTML($idxF);
            print "<td><input type=checkbox name=index value=\"$idxHtml\"><a href=\"$urlHtml\">$yearHtml</a> <font size=-1>(<a href=\"?task=viewindex&index=$idxQuery\">i</a>)</font></td>";
        }
        print "</tr>\n";
    }
    print "</table>\n";

    print "&nbsp;&nbsp;&nbsp;&nbsp;<button name=selectall type=button onclick=\"selectAll(this.form)\">Select all</button> <button name=selectnone type=button onclick=\"selectNone(this.form)\">Select none</button><br/>\n";
    print "Number of results per index: " . popup_menu('numres', ['10','25','50','all']) . "<br/>\n";
    print submit('Submit');

    print endform();

    print "<h2>Submit a New Index</h2>\n";
    print start_multipart_form();
    print hidden(-name => 'task', -default => 'newindex', -override => 1);
    print "URL for index: ";
    print textfield('indexurl','',50);
    print "<br/>\n";
    print "Name for index: ";
    print textfield('indexname','',50);
    print "<br/>\n";
    print submit('Submit');
    print endform();

    print "<hr>Email bugs, comments, suggestions, etc to <a href=\"mailto:me AT hal3 DOT name\"><img border=0 valign=bottom src=\"http://hal3.name/email.png\" ALT=\"me AT hal3 DOT name\"></a><br/>\n";

} elsif ($task eq 'viewindex') {
    my $indexfile = index_path_for(scalar param('index'));
    if (not defined $indexfile) {
        print "Error: unknown index<br/>\n";
    } else {
        open(my $in, '<', $indexfile) or die "cannot read index $indexfile ($!)";
        my $url = <$in>;
        my $title = <$in>;
        $url = '' unless defined $url;
        $title = '' unless defined $title;
        chomp $url;
        chomp $title;
        print "<h3>Index of <a href=\"" . CGI::escapeHTML($url) . "\">" . CGI::escapeHTML($title) . "</a></h3>\n";
        print "<ul>\n";

        # Remaining lines are tab separated: paper URL, id, title, words.
        my %all = ();
        while (my $line = <$in>) {
            chomp $line;
            my ($paperUrl, $num, $paperTitle, $words) = split /\t/, $line;
            next unless defined $paperTitle;     # malformed line
            $all{$paperTitle}{$paperUrl} = 1;
        }
        close $in;

        foreach my $paperTitle (sort keys %all) {
            foreach my $paperUrl (sort keys %{$all{$paperTitle}}) {
                print "<li><a href=\"" . CGI::escapeHTML($paperUrl) . "\">" . CGI::escapeHTML($paperTitle) . "</a></li>\n";
            }
        }
        print "</ul>\n";
    }
} elsif ($task eq 'schedindex') {
    my $indexfile = index_path_for(scalar param('index'));

    my $numTracks = param('numTracks') || 4;
    my $perSession = param('perSession') || 5;
    my $lambda = param('lambda') || 0.9;

    if (not defined $indexfile) {
        print "Error: unknown index<br/>\n";
    } elsif (($numTracks !~ /\A\d+\z/) || ($perSession !~ /\A\d+\z/)
             || ($lambda !~ /\A(?:\d+(?:\.\d*)?|\.\d+)\z/)) {
        print "Error: numTracks and perSession must be integers and lambda a number<br/>\n";
    } else {
        print "Please wait, scheduling...\n";

        # Fork-and-exec instead of a shell command line: the child gets the
        # index file on STDIN and the arguments are passed as a list, so no
        # request data is ever parsed by a shell.
        my $pid = open(my $sched, '-|');
        die "cannot fork ($!)" unless defined $pid;
        if ($pid == 0) {
            open(STDIN, '<', $indexfile) or die "cannot read index $indexfile ($!)";
            exec('./schedule.pl', $numTracks, $perSession, $lambda)
                or die "cannot run ./schedule.pl ($!)";
        }

        # NOTE(review): the scheduler's output is passed through unescaped,
        # as before; confirm whether schedule.pl emits plain text or HTML.
        print "<pre>\n";
        while (my $line = <$sched>) {
            print $line;
        }
        print "</pre>\n";
        close $sched;
    }
} elsif ($task eq 'compare') {
    my $file = param('bibfile');
    my @requested = param('index');
    my $numRes = param('numres');

    # Server-side temporary copy of the upload; empty/undefined when the
    # field did not carry a real file upload.
    my $tmpfile = $file ? tmpFileName($file) : '';

    # Row limit per index: undef = invalid request, 0 = unlimited ('all').
    # (Comparing the string 'all' numerically used to stop after one row.)
    my $limit;
    if (defined $numRes) {
        if ($numRes eq 'all') { $limit = 0; }
        elsif ($numRes =~ /\A[1-9]\d*\z/) { $limit = $numRes; }
    }

    if ($file && $tmpfile && @requested && (defined $limit)) {
        # Archive a compressed copy of the upload, once per request.  The
        # address is reduced to a conservative character set because it
        # becomes part of a shell redirection target.
        my $addr = $ENV{'REMOTE_ADDR'} || 'UNKaddr';
        $addr =~ s/[^0-9A-Za-z.:]/_/g;
        my $outFile = "$ROOT/bibs/$DATE.$addr.bz2";
        if (not -e $outFile) {
            open(my $bz, '|-', "bzip2 -9 > '$outFile'") or die "cannot run bzip2 ($!)";
            open(my $bib, '<', $tmpfile) or die "cannot read uploaded file ($!)";
            while (my $chunk = <$bib>) { print $bz $chunk; }
            close $bib or die;
            close $bz;
        }

        my $fileHtml = CGI::escapeHTML("$file");

        foreach my $wanted (@requested) {
            my $indexfile = index_path_for($wanted);
            if (not defined $indexfile) {
                print "Error: unknown index<br/>\n";
                next;
            }

            open(my $idx, '<', $indexfile) or die "cannot read index $indexfile ($!)";
            my $url = <$idx>;
            my $name = <$idx>;
            close $idx;
            $url = '' unless defined $url;
            $name = '' unless defined $name;
            chomp $url;
            chomp $name;

            print "<h3>Comparing <i>$fileHtml</i> with index of <a href=\"" . CGI::escapeHTML($url) . "\">" . CGI::escapeHTML($name) . "</a></h3>\n";

            # List-form pipe open: the helper is exec'd directly, no shell.
            open(my $cmp, '-|', "$ROOT/compare_bib_and_index.pl", $tmpfile, $indexfile)
                or die "cannot run compare_bib_and_index.pl ($!)";
            print "<table>\n";
            print "<tr><td><b>Score</b></td><td><b>Paper</b></td></tr>\n";
            my $n = 0;
            my %hit = ();
            while (my $line = <$cmp>) {
                chomp $line;
                # Expected fields: similarity, title, paper URL.
                my ($sim, $title, $pdf) = split /\t/, $line;
                next unless defined $pdf;        # malformed line
                if (exists $hit{$pdf}) { next; } # show each paper once
                $hit{$pdf} = 1;
                $sim = int($sim * 1000) / 10;    # fraction -> percent, one decimal
                print "<tr><td>" . CGI::escapeHTML($sim) . "</td><td><a href=\"" . CGI::escapeHTML($pdf) . "\">" . CGI::escapeHTML($title) . "</a></td></tr>\n";
                $n++;
                if ($limit && ($n >= $limit)) { last; }
            }
            print "</table>\n";
            close $cmp;
            print "<p/>\n";
        }
    } else {
        print "Error:<br/>\n";
        if ((not $file) || (not $tmpfile)) { print "Error: bibfile not uploaded<br/>\n"; }
        if (not @requested) { print "Error: index not specified<br/>\n"; }
        if (not defined $limit) { print "Error: invalid number of results<br/>\n"; }
    }
} elsif ($task eq 'newindex') {
    my $url = param('indexurl');
    my $name = param('indexname');

    # Require an http(s) URL without whitespace or control characters: the
    # URL is stored as the first line of the index file and handed to wget.
    if ($url && $name && ($url =~ m{\Ahttps?://}i) && ($url !~ /[\s\x00-\x1f]/)) {
        print "Computing new index for " . CGI::escapeHTML($name) . " (" . CGI::escapeHTML($url) . ")...please be patient!<p/>\n";

        indexit($url,$name);

        print "<p/><b>Done!</b>  <a href=\"\">Back</a><br/>";
    } else {
        print "Error: please supply a name and an http(s) URL for the index<br/>\n";
    }
}



print end_html();


# index_path_for($requested)
#
# Resolve a client-supplied index identifier to a file inside $ROOT/index.
# Accepts either a bare file name or a full "$ROOT/index/NAME" path (the
# form previously submitted by the compare checkboxes).  Returns the
# absolute path, or nothing when the identifier is missing, contains a path
# separator or control character, is "." / "..", or does not name an
# existing regular file.
sub index_path_for {
    my ($requested) = @_;
    return unless defined $requested;

    my $name = "$requested";
    $name =~ s/^\Q$ROOT\E\/index\///;
    return if $name eq '';
    return if $name =~ /\A\.\.?\z/;
    return if $name =~ m{[\x00-\x1f/\\]};

    my $path = "$ROOT/index/$name";
    return unless -f $path;
    return $path;
}



# indexit($url, $name)
#
# Build the index file for the conference page at $url, labelled $name.
#
# Steps, each reported to the browser as it runs:
#   1. download the page and collect every link ending in ".pdf" plus
#      doi.acm.org / dx.doi.org links;
#   2. download each target into $ROOT/tmp/<indexname>/<id>
#      (getACM / getDOI handle the two DOI resolvers);
#   3. convert each download with $ROOT/pdftotext into <id>.txt;
#   4. pipe each text file through getreftext.pl and preprocess.perl and
#      record the distinct lowercase words longer than 3 letters;
#   5. write $ROOT/index/<indexname>: line 1 = $url, line 2 = $name, then
#      one "url<TAB>id<TAB>title<TAB>words" line per paper that yielded
#      at least 10 distinct words.
#
# External programs receive their arguments as lists (no shell), because the
# URL comes from the request and the links come from a remote page.
# Dies if wget cannot be started or a required file cannot be written.
sub indexit {
    my ($url,$name) = @_;

    # The index header is line oriented; keep the label on a single line.
    $name =~ s/[\r\n]+/ /g;

    # Base for document-relative links: the URL minus its last path segment.
    my $urlNoTrail = $url;
    if ($urlNoTrail =~ m{\A[^/]*//[^/]*\z}) {
        # Bare "scheme://host": there is no path segment to strip.
        $urlNoTrail .= '/';
    } else {
        $urlNoTrail =~ s{/[^/]*\z}{};
        $urlNoTrail .= '/';
    }

    # Base for root-relative links ("/x/y.pdf"): "scheme://host" without a
    # trailing slash, since such links bring their own leading slash.
    my $urlHead = $url;
    if ($url =~ m{\A(.*?)://([^/]+)}) {
        $urlHead = $1 . "://" . $2;
    }

    # File-system-safe name derived from the URL: only [A-Za-z0-9._].
    my $indexname = $url;
    $indexname =~ s/^.*:\/\///;
    $indexname =~ s/\//___/g;
    $indexname =~ s/[^A-Za-z0-9\.]/_/g;
    if (($indexname eq '') || ($indexname =~ /\A\.\.?\z/)) {
        print "Error: cannot derive an index name from that URL<br/>\n";
        return;
    }

    my $indexPath = "$ROOT/index/$indexname";
    my $workDir = "$ROOT/tmp/$indexname";

    # Create the (empty) index file and the working directory if missing.
    if (open(my $touch, '>>', $indexPath)) { close $touch; }
    if (not -d $workDir) { mkdir $workDir; }

    print "&nbsp;&nbsp;&nbsp;&nbsp;Downloading index...";

    my $txt = '';
    open(my $page, '-|', 'wget', '-q', '-U', 'Mozilla', '-O', '-', '--', $url)
        or die "cannot run wget ($!)";
    {
        local $/;                       # slurp the whole page
        my $body = <$page>;
        $txt = $body if defined $body;
    }
    close $page;

    # Keep a copy of the last downloaded page in $ROOT/.tmp.
    open(my $copy, '>', "$ROOT/.tmp") or die "cannot write to $ROOT/.tmp ($!)";
    print $copy $txt;
    close $copy;

    my $href = qr/[hH][rR][eE][fF]/;

    # Link => numeric id, in order of discovery.
    my %pdfs = ();
    my $pdfId = 0;
    my $remember = sub {
        my ($link) = @_;
        # Tabs/newlines would corrupt the tab-separated index format.
        return if $link =~ /[\t\r\n]/;
        if ($link =~ m{\A/}) { $link = $urlHead . $link; }
        elsif ($link !~ m{://}) { $link = $urlNoTrail . $link; }
        if (not exists $pdfs{$link}) { $pdfs{$link} = $pdfId++; }
    };

    # Each pattern deletes what it matched so the loops terminate.
    while ($txt =~ s{$href="([^"]+\.pdf)"}{}) { $remember->($1); }
    while ($txt =~ s{$href=([^>]+\.pdf)}{}) { $remember->($1); }
    while ($txt =~ s{$href="(http://doi\.acm\.org/[0-9./]+)">}{}) { $remember->($1); }
    while ($txt =~ s{$href=(http://doi\.acm\.org/[0-9./]+)>}{}) { $remember->($1); }
    while ($txt =~ s{$href="(http://dx\.doi\.org/[^"]+)">}{}) { $remember->($1); }

    print '' . (scalar keys %pdfs) . " total PDFs...<br/>\n";

    # Process links in discovery order so reruns produce the same index.
    my @links = sort { $pdfs{$a} <=> $pdfs{$b} } keys %pdfs;

    print "&nbsp;&nbsp;&nbsp;&nbsp;Downloading PDFs...";
    foreach my $f (@links) {
        print ".";
        my $o = "$workDir/$pdfs{$f}";
        # Skip anything already downloaded or converted by an earlier run.
        if ((not -e $o) && (not -e "$o.txt")) {
            if ($f =~ m{\Ahttp://doi\.acm\.org}) { getACM($f,$o); }
            elsif ($f =~ m{\Ahttp://dx\.doi\.org}) { getDOI($f,$o); }
            else { system('wget', '-q', '-U', 'Mozilla', '-O', $o, '--', $f); }
        }
    }
    print "<br/>\n";

    print "&nbsp;&nbsp;&nbsp;&nbsp;Converting to text...";
    foreach my $f (@links) {
        print ".";
        my $o = "$workDir/$pdfs{$f}";
        if (not -e $o) { next; }
        if (-e "$o.txt") { next; }
        system("$ROOT/pdftotext", $o, "$o.txt");
    }
    print "<br/>\n";

    open(my $index, '>', $indexPath) or die "cannot open $indexPath for writing ($!)";

    print $index $url . "\n";
    print $index $name . "\n";

    print "&nbsp;&nbsp;&nbsp;&nbsp;Extracting reference text...";
    foreach my $f (@links) {
        print ".";
        my $txtFile = "$workDir/$pdfs{$f}.txt";
        if (not -e $txtFile) { next; }

        # Vocabulary of the paper's reference section.  The shell pipeline is
        # built only from $ROOT, the sanitised index name and a numeric id.
        my %words = ();
        open(my $refs, '-|', "cat $txtFile | $ROOT/getreftext.pl | /home/hal/bin/preprocess.perl -q 2>&1") or next;
        while (my $line = <$refs>) {
            chomp $line;
            $line = lc($line);
            $line =~ s/[^A-Za-z ]/ /g;
            foreach my $w (split /[\s]+/, $line) {
                if (length($w) > 3) { $words{$w}++; }
            }
        }
        close $refs;

        # Title = first line of the text; pages starting with the JMLR
        # banner carry it four lines further down.
        open(my $text, '<', $txtFile) or next;
        my $title = <$text>;
        if ((defined $title) && ($title =~ /^Journal of Machine Learning Research/)) {
            $title = <$text> for 1 .. 4;
        }
        close $text;
        $title = '' unless defined $title;      # empty or truncated file
        chomp $title;
        $title =~ s/[\s]+/ /g;

        # Too few words: reference extraction most likely failed.
        if (scalar(keys %words) < 10) { next; }

        print $index $f . "\t" . $pdfs{$f} . "\t" . $title . "\t";
        print $index join(' ', sort keys %words);
        print $index "\n";
    }
    close $index or die;

    print "<br/>\n";
}


# getdate()
#
# Return the current local time as "YEAR-MONTH-DAY-HOUR-MIN-SEC".  The
# fields are plain numbers joined with dashes; none of them is zero padded.
sub getdate {
    my @now = localtime(time);
    # localtime fields: 0 sec, 1 min, 2 hour, 3 mday, 4 mon (0-based),
    # 5 year (offset from 1900).
    my $stamp = join('-', $now[5] + 1900, $now[4] + 1, @now[3, 2, 1, 0]);
    return $stamp;
}

# getACM($doi, $o)
#
# Download the full text behind an ACM DOI link into the file $o.
#   $doi  URL of the DOI landing page
#   $o    path of the output file
#
# Fetches the landing page, looks for the anchor named "FullText" whose
# target starts with "ft_gateway.cfm?", and downloads that target from
# portal.acm.org with the landing page as referer.  Does nothing when no
# such anchor is found.  Dies if wget cannot be started.  wget is invoked
# with list arguments (no shell) because both URLs come from remote content.
sub getACM {
    my ($doi, $o) = @_;

    open(my $page, '-|', 'wget', '-q', '-U', 'Mozilla', '-O', '-', '--', $doi)
        or die "cannot run wget for $doi ($!)";
    my $url = '';
    while (my $line = <$page>) {
        # "." and "?" are literal here; [^"]+ stops at the attribute's
        # closing quote instead of the last quote on the line.
        if ($line =~ /<A NAME="FullText" HREF="(ft_gateway\.cfm\?[^"]+)"/i) {
            $url = $1;
            last;
        }
    }
    # Stopping early may end wget with a broken pipe, so the close status
    # is deliberately ignored.
    close $page;

    if ($url ne '') {
        system('wget', '-q', '-U', 'Mozilla', "--referer=$doi", '-O', $o,
               '--', "http://portal.acm.org/$url");
    }
    return;
}

# getDOI($doi, $o)
#
# Download the PDF behind a dx.doi.org link into the file $o.
#   $doi  URL of the DOI link
#   $o    path of the output file
#
# Fetches the page the DOI leads to and uses the first anchor whose text
# contains "Print PDF" and whose target contains "reprint".  Returns
# without downloading when no such anchor exists.  Dies if wget cannot be
# started.  wget is invoked with list arguments (no shell) because both
# URLs come from remote content.
sub getDOI {
    my ($doi,$o) = @_;

    open(my $page, '-|', 'wget', '-q', '-U', 'Mozilla', '-O', '-', '--', $doi)
        or die "cannot run wget for $doi ($!)";
    my $txt = do { local $/; <$page> };     # slurp the whole page
    close $page;
    $txt = '' unless defined $txt;

    my $url = '';
    while ($txt =~ m{<a href="([^"]+)">(.*?)</a>}g) {
        my $u = $1; my $l = $2;
        if (($l =~ /Print PDF/) && ($u =~ /reprint/)) {
            # Use the saved copy: $1 no longer holds the link after the two
            # capture-less matches in the condition above.
            $url = $u;
            last;
        }
    }
    if ($url eq '') { return; }

    if ($url =~ m{\A/}) {
        # Root-relative link: resolve it against the host of $doi.
        # NOTE(review): if the resolver redirected to another host, the link
        # is presumably relative to that host instead - confirm.
        if ($doi =~ m{\A(https?://[^/]+)}) {
            $url = $1 . $url;
        } else {
            return;
        }
    }
    system('wget', '-q', '-U', 'Mozilla', "--referer=$doi", '-O', $o, '--', $url);
    return;
}
