#!/usr/bin/perl -w use strict 'refs'; use CGI qw(:standard); use CGI::Carp qw(warningsToBrowser fatalsToBrowser); $|++; $CGI::DISABLE_UPLOADS = 0; my $ROOT = '/home/hal/public_html/WhatToSee'; my $DATE = getdate(); print header; my $JAVASCRIPT=<"WhatToSee", -script=>$JAVASCRIPT); my $task = param('task'); print "

WhatToSee

\n"; if ((not defined $task) || ($task eq '')) { print<you like to cite with what new papers are citing. High overlap means the paper is probably relevant to you. Sure there are counter-examples, but overall I have found it useful (eg., it has suggested papers to me that are interesting that I would otherwise have missed). Of course, you should also read through titles since that is a somewhat orthogonal source of information.

Here is how to use the system. You upload your personal bibtex file and have the system compare it to a known conference index; it will then present a list of papers, sorted by relevance. If you want to compare to a conference that is not yet indexed, you need to request that indexing take place. This takes about 30 seconds per paper, so you will probably have to be patient.

ENDBEGIN print "

Compare to Known Index

\n"; print start_multipart_form(); print "Path to your bibfile: "; print hidden('task', 'compare'); print filefield('bibfile','',50); print "
Select indices:
"; open L, "ls $ROOT/index/* |" or die; my %urls = (); while (my $indexfile = ) { chomp $indexfile; open F, $indexfile or die "cannot read index $indexfile ($!)"; my $url = ''; my $name = ''; my $numlines = 0; while () { chomp; if ($numlines == 0) { $url = $_; } if ($numlines == 1) { $name = $_; } $numlines++; } if (($url eq '') || ($numlines < 10)) { next; } if ($name =~ /^([^\s]+)[\s]+(.+)$/) { my $conf = $1; my $year = $2; $urls{$conf}{$year}{URL} = $url; $urls{$conf}{$year}{IDX} = $indexfile; } else { $urls{$name}{''}{URL} = $url; $urls{$name}{''}{IDX} = $indexfile; } } print "\n"; foreach my $conf (sort keys %urls) { # print " "; print ""; my @years = keys %{$urls{$conf}}; # if (@years == 1) { # my $url = $urls{$conf}{$years[0]}{URL}; # my $idx = $urls{$conf}{$years[0]}{IDX}; # print "$conf $years[0](index)
\n"; # } else { print ""; foreach my $year (sort @years) { my $url = $urls{$conf}{$year}{URL}; my $idx = $urls{$conf}{$year}{IDX}; my $idxF= $idx; $idxF =~ s/^$ROOT\/index\///; print ""; } print "\n"; # } } print "

$conf:

$year (i)

\n"; #foreach my $url (sort { $urls{$a}{NAME} cmp $urls{$b}{NAME} } keys %urls) { # my $idx = $urls{$url}{INDEX}; $idx =~ s/^$ROOT\/index\///; # print " $urls{$url}{NAME} (view index)
\n"; #} close L; print "
\n"; print "Number of results per index: " . popup_menu('numres', ['10','25','50','all']) . "
\n"; print submit('Submit'); print endform; print "

Submit a New Index

\n"; print start_multipart_form(); print hidden('task', 'newindex'); print "URL for index: "; print textfield('indexurl','',50); print "
\n"; print "Name for index: "; print textfield('indexname','',50); print "
\n"; print submit('Submit'); print endform; print "

Email bugs, comments, suggestions, etc to $\"me$
\n"; } elsif ($task eq 'viewindex') { if (my $indexfile = param('index')) { $indexfile = "$ROOT/index/$indexfile"; open F, $indexfile or die; my $url = ; chomp $url; my $title = ; chomp $title; print "

Index of $title

\n"; print "

) { chomp; my ($url, $num, $title, $words) = split /\t/, $_; $all{$title}{$url} = 1; } close F or die; foreach my $title (sort keys %all) { foreach my $url (sort keys %{$all{$title}}) { print "

$title

\n"; } } print "

\n"; } } elsif ($task eq 'schedindex') { if (my $indexfile = param('index')) { $indexfile = "$ROOT/index/$indexfile"; my $numTracks = param('numTracks') || 4; my $perSession = param('perSession') || 5; my $lambda = param('lambda') || 0.9; print "Please wait, scheduling...\n"; open F, "./schedule.pl $numTracks $perSession $lambda < $indexfile |" or die; print "

\n";
        while () {
            print;
        }
        print "

\n"; } } elsif ($task eq 'compare') { if ((my $file = param('bibfile')) && (my @indexfiles = param('index')) && (my $numRes = param('numres'))) { my $tmpfile = tmpFileName($file); my $mimetype = uploadInfo($file)->{'Content-Type'} || ''; foreach my $indexfile (@indexfiles) { open I, $indexfile or die; my $url = ; chomp $url; my $name = ; chomp $name; close I or die; print "
Comparing $file with index of $name
\n"; my $addr = $ENV{'REMOTE_ADDR'} || 'UNKaddr'; my $outFile = "$ROOT/bibs/$DATE.$addr.bz2"; if (not -e $outFile) { open F, "| bzip2 -9 > $outFile" or die; open I, $tmpfile or die; while () { print F $_; } close I or die; close F; } open F, "$ROOT/compare_bib_and_index.pl $tmpfile $indexfile |" or die; print "\n"; print "\n"; my $n = 0; my %hit = (); while () { chomp; my ($sim, $title, $pdf) = split /\t/, $_; if (exists $hit{$pdf}) { next; } $hit{$pdf} = 1; $sim = int($sim * 1000) / 10; print "\n"; if (++$n >= $numRes) { last; } } print "
Score Paper
$sim $title
\n"; close F; print "
\n"; } } else { print "Error:
\n"; if (not defined param('bibfile')) { print "Error: bibfile not uploaded
\n"; } if (not defined param('index')) { print "Error: index not specified
\n"; } } } elsif ($task eq 'newindex') { if ((my $url = param('indexurl')) && (my $name = param('indexname'))) { if ($url =~ /^http/) { print "Computing new index for $name ($url)...plese be patient!
\n"; indexit($url,$name); print "
Done! Back
"; } } } print end_html; sub indexit { my ($url,$name) = @_; my $urlNoTrail = $url; $urlNoTrail =~ s/\/[^\/]*$//; $urlNoTrail .= '/'; my $urlHead = $url; if ($url =~ /^(.*):\/\/([^\/]+)/) { $urlHead = $1 . "://" . $2 . '/'; } my $indexname = $url; $indexname =~ s/^.*:\/\///; $indexname =~ s/\//___/g; $indexname =~ s/[^A-Za-z0-9\.]/_/g; # if (-e "$ROOT/index/$indexname") { } `touch $ROOT/index/$indexname`; `mkdir $ROOT/tmp/$indexname`; print "    Downloading index..."; my %pdfs = (); my $txt = ''; open F, "wget -q -U Mozilla -O - '$url' | "; while () { $txt .= $_; } close F; open O, "> $ROOT/.tmp" or die "cannot write to $ROOT/.tmp ($!)"; print O $txt; close O; my $href = "[hH][rR][eE][fF]"; my $pdfId = 0; while ($txt =~ s/$href=\"([^\"]+.pdf)\"//) { my $f = $1; #print "
[$f]\n"; if ($f =~ /^\//) { $f = $urlHead . $f; } elsif (not ($f =~ /:\/\//)) { $f = $urlNoTrail . $f; } if (not exists $pdfs{$f}) { $pdfs{$f} = $pdfId++; } } while ($txt =~ s/$href=([^>]+.pdf)//) { my $f = $1; if ($f =~ /^\//) { $f = $urlHead . $f; } elsif (not ($f =~ /:\/\//)) { $f = $urlNoTrail . $f; } if (not exists $pdfs{$f}) { $pdfs{$f} = $pdfId++; } } while ($txt =~ s/$href=\"(http:\/\/doi.acm.org\/[0-9\.\/]+)\">//) { if (not exists $pdfs{$1}) { $pdfs{$1} = $pdfId++; } } while ($txt =~ s/$href=(http:\/\/doi.acm.org\/[0-9\.\/]+)>//) { if (not exists $pdfs{$1}) { $pdfs{$1} = $pdfId++; } } while ($txt =~ s/$href=\"(http:\/\/dx.doi.org\/[^\"]+)\">//) { if (not exists $pdfs{$1}) { $pdfs{$1} = $pdfId++; } } print '' . (scalar keys %pdfs) . " total PDFs...
\n"; print "    Downloading PDFs..."; foreach my $f (keys %pdfs) { print "."; my $o = "$ROOT/tmp/" . $indexname . '/' . $pdfs{$f}; if ((not -e $o) && (not -e "$o.txt")) { if ($f =~ /^http:\/\/doi.acm.org/) { getACM($f,$o); } elsif ($f =~ /^http:\/\/dx.doi.org/) { getDOI($f,$o); } else { `wget -q -U Mozilla -O $o '$f'`; } } } print "
\n"; print "    Converting to text..."; foreach my $f (keys %pdfs) { print "."; my $o = "$ROOT/tmp/" . $indexname . '/' . $pdfs{$f}; if (not -e $o) { next; } if (-e "$o.txt") { next; } #print "
[$f / $pdfs{$f}]\n"; `$ROOT/pdftotext $o $o.txt`; #`rm $o`; } print "
\n"; open I, "> $ROOT/index/$indexname" or die "cannot open $ROOT/index/$indexname for writing ($!)"; print I $url . "\n"; print I $name . "\n"; print "    Extracting reference text..."; foreach my $f (keys %pdfs) { print "."; my $txt = "$ROOT/tmp/" . $indexname . '/' . $pdfs{$f} . '.txt'; if (not -e $txt) { next; } my $pdf = "$ROOT/tmp/" . $indexname . '/' . $pdfs{$f}; my %words = (); #print "
[$f / $pdfs{$f}]\n"; open F, "cat $txt | $ROOT/getreftext.pl | /home/hal/bin/preprocess.perl -q 2>&1 |" or next; #open F, "$ROOT/pdftotext $pdf - | $ROOT/getreftext.pl | /home/hal/bin/preprocess.perl -q |" or next; while () { #print "[$_]
"; chomp; $_ = lc($_); s/[^A-Za-z ]/ /g; my @w = split /[\s]+/, $_; foreach my $w (@w) { if (length($w) > 3) { $words{$w}++; } } } close F; my $title = ''; open F, $txt or next; $title = ; chomp $title; if ($title =~ /^Journal of Machine Learning Research/) { $title = ; $title = ; $title = ; $title = ; chomp $title; } close F; $title =~ s/[\s]+/ /g; if (scalar keys %words < 10) { next; } print I $f . "\t" . $pdfs{$f} . "\t" . $title . "\t"; my $first = 1; foreach my $w (sort keys %words) { if (not $first) { print I ' '; } print I $w; $first = 0; } print I "\n"; } close I or die; print "
\n"; } sub getdate { my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time); return '' . ($year+1900) . '-' . ($mon+1) . "-$mday-$hour-$min-$sec"; } sub getACM { my ($doi, $o) = @_; open DOI, "wget -q -U Mozilla -O - '$doi' |" or die; my $url = ''; while () { if (/) { $txt .= $_; } close DOI; my $url = ''; while ($txt =~ s/(.*?)<\/a>//) { my $u = $1; my $l = $2; if (($l =~ /Print PDF/) && ($u =~ /reprint/)) { $url = $1; last; } } if ($url eq '') { return; } if ($url =~ /^\//) { $doi =~ m/^http:\/\/([^\/]+)/; $url = $1 . $url; } `wget -q -U Mozilla --referer='$doi' -O $o '$url'`; }

WhatToSee

Compare to Known Index

Submit a New Index

Index of $title

Comparing $file with index of $name