Here's a quick hack to monitor patent and application searches.
First, go to the USPTO search page, perform a search, and copy the search URL. For example, here's the search URL for Microsoft's patent applications:
http://appft1.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.html&r=0&p=1&f=S&l=50&Query=an%2Fmicrosoft%24&d=PG01
If you want to run as a script and produce a static rss file, you'd simply do something like this:
rss-uspto.pl 'http://appft1.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.html&r=0&p=1&f=S&l=50&Query=an%2Fmicrosoft%24&d=PG01' > ~/public_html/rss/microsoft-applications.rss
and point your aggregator to http://127.0.0.1/~user/rss/microsoft-applications.rss.
To run as a cgi script, you might have to copy or symlink the filename to rss-uspto.cgi and/or place it in your cgi-bin directory. Then point your aggregator to:
http://127.0.0.1/~user/cgi-bin/rss-uspto.cgi?http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2Fsearch-adv.htm&r=0&p=1&f=S&l=50&Query=an%2Fmicrosoft%24&d=ptxt
or whatever your local variant happens to be.
By default, only the link to the patent and its title are present in the feed. If you also want the date and abstract, set the 'DETAILS' constant in the script to a true value; the script will then request each patent page in the search results to extract the extra information.
I also included a direct link to Pat2PDF for those who want an alternative to the TIFF viewer on the USPTO.
The Code
#!/usr/bin/env perl
use strict;
use warnings;
use Date::Parse;
use LWP::Simple;
use XML::RSS;
## This script can be called on the commandline:
## rss-uspto.pl 'http://search.url' > search1.rss
## or as a cgi:
## http://127.0.0.1/cgi-bin/rss-uspto.cgi?http://search.url
##
## By default, the script will only extract the basic information from the
## search result page, but changing the following constant will allow it
## to request each patent link and pull extra details (date and abstract).
## 0 = use only what the result listing provides (number, title, link);
## true = additionally call get_page() for every listed patent.
use constant DETAILS => 0;
## Forward declarations: both subs are defined after the main program,
## which calls them, and carry a one-scalar prototype.
sub get_page ($);
sub make_permalink ($);
## Work out where the search URL comes from: the query string when run as a
## CGI, otherwise the first command-line argument.
my ($url, $cgi);
if ($url = $ENV{QUERY_STRING})
{
    $cgi = 1;
    ## The body is an RSS document, so announce it as such.
    print "Content-type: application/rss+xml\n\n";
}
else
{
    $url = shift;
}
exit unless $url;
## Issued patents and patent applications live on different servers.
my ($server) = $url =~ m[^(http://(?:appft1|patft)\.uspto\.gov)/]i;
## Refuse anything that is not a USPTO search URL: the permalinks are built
## from $server, and a CGI must not fetch arbitrary caller-supplied URLs.
exit unless $server;
my $html = get($url) or exit;
## Collect one hash per patent in @items and build the channel title.
my @items;
my $ctitle = 'USPTO Search';
my ($title) = $html =~ m[<title>([^<>]+)</title>]i or exit;
## A result listing's title embeds the query between a fixed prefix and
## suffix; whitespace around the query is tolerated and not captured.
my ($query) = $title =~ m[
    ^(?:Patent|PreGrant\ Publication)\ Database\ Search\ Results:
    \s* (.*?) \s*
    in\ (?:1976\ to\ present|PGPUB\ Production\ Database)$
]xs;
if ($query)
{
    $ctitle .= ": $query";
    ## Each result row starts with its ordinal in the first table cell.
    for my $block ($html =~ m[<TR><TD valign=top>\d+</TD>(.*?)</TR>]gs)
    {
        ## The same HREF appears twice per row: once around the patent
        ## number and once around the title.
        my ($href, $pnum, $ititle) = $block =~ m[
            <A \s+ HREF=([^>]+)>\s*((?:[DHT]|PP|RE)?[\d,]+)</A>
            .*?
            <A \s+ HREF=\1>([^<]+)</A>
        ]sx or next;
        for ($pnum, $ititle)
        {
            s/\s+/ /g;
            s/^ | $//g;
        }
        my $link = make_permalink($pnum);
        my $item = { pnum=>$pnum, title=>"$pnum: $ititle", link=>$link };
        if (DETAILS)
        {
            ## Merge rather than replace so the listing's fields (notably
            ## pnum) survive when the detail page does not supply them.
            my $detail = get_page($link);
            $item = { %$item, %$detail } if $detail;
        }
        push @items, $item;
    }
}
## For issued patents, queries returning a single result are redirected
## to the patent document instead of displaying the search result listing.
elsif ($title eq 'Single Document')
{
    ## Pull the redirect target out of the fetched page's META refresh tag.
    my ($redir) = $html =~
        m[<META\s+HTTP-EQUIV="REFRESH"\s+CONTENT="\d+;\s*URL=([^">]+)]i;
    exit unless $redir;
    if ($redir =~ /Query=([^&]+)/)
    {
        my $redir_query = $1;
        $redir_query =~ s/\+/ /g;
        $ctitle .= ": $redir_query";
    }
    my $item = get_page($server . $redir) or exit;
    push @items, $item;
}
exit unless @items;
## Turn the collected items into an RSS 2.0 document on standard output.
my $rss = XML::RSS->new(version=>'2.0');
$rss->channel(
    title => $ctitle,
    link => $url,
    ttl => 1440,
);
for my $item (@items)
{
    ## Items taken from the result listing alone carry no date, so only
    ## parse one when it is actually there.
    my $date;
    if (defined $item->{date} and length $item->{date})
    {
        my $epoch = str2time($item->{date});
        $date = rfc822date($epoch) if defined $epoch;
    }
    my $desc = defined $item->{desc} ? $item->{desc} : '';
    ## Without a patent number the Pat2PDF link would be useless; skip it.
    if (defined $item->{pnum})
    {
        $desc .= '<p><a href="http://www.pat2pdf.com/cgi-bin/patent_pdf.'
            . 'cgi?patent_number=' . $item->{pnum}
            . '">Get the complete PDF</a>';
    }
    my %entry = (
        title => $item->{title},
        link => $item->{link},
        description => $desc,
    );
    $entry{pubDate} = $date if defined $date;
    $rss->add_item(%entry);
}
print $rss->as_string;
## Links found in a result listing are offsets into that particular search
## and therefore shift over time. Build a link that looks the document up
## by its number instead.
##
## Takes the patent/publication number; returns the lookup URL on $server.
sub make_permalink ($)
{
    my ($number) = @_;
    ## The database and number-field names differ between the issued-patent
    ## server and the application server.
    my ($db, $field);
    if ($server =~ m[^http://patft]i)
    {
        ($db, $field) = ('PALL', 'WKU');
    }
    else
    {
        ($db, $field) = ('PG01', 'PGNR');
    }
    my $prefix = $server . '/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d='
        . $db . '&p=1&u=';
    my $suffix = '/netahtml/srchnum.html&r=1&f=G&l=50&s1='
        . $number . '.' . $field . '.';
    return $prefix . $suffix;
}
## Fetch one patent (or application) document and extract its details.
##
## Takes the URL of the document. Returns a hash reference with the keys
## pnum, link, title, date and desc, or nothing when the page cannot be
## fetched or does not have the expected layout.
sub get_page ($)
{
    my $url = shift;
    my $html = get($url) or return;
    my ($title) = $html =~ m[<title>([^<>]+)</title>]i or return;
    ## The page title carries the patent/publication number.
    my ($pnum) = $title =~ m[
        ^United\ States\ Patent(?:\ Application)?:\ ((?:[DHT]|PP|RE)?[\d,]+)$
    ]x or return;
    ## Date from the header table, then the title and the abstract paragraph.
    my ($date, $ititle, $desc) = $html =~ m[
        <TABLE\ WIDTH="100%"> .*? United\ States\ Patent .*?
        <B>([\w\s,]+)</B>\s*</TD>\s*</TR>\s*</TABLE>
        .*?
        <font\ size="\+1">(.*?)</font><BR> \s*
        <BR><CENTER><B>Abstract</B></CENTER> \s*
        <P>(.*?)</P>
    ]sx;
    return unless $ititle and $desc;
    ## Drop bold/italic markup and normalise whitespace in every field.
    for ($ititle, $desc, $date)
    {
        s[</?[BI]>][]g;
        s/\s+/ /g;
        s/^ | $//g;
    }
    $ititle = "$pnum: $ititle";
    my $link = make_permalink($pnum);
    ## pnum is included because the feed builder uses it for the PDF link.
    return {
        pnum => $pnum,
        link => $link,
        title => $ititle,
        date => $date,
        desc => $desc,
    };
}
## Format an epoch timestamp as an RFC 822 date in GMT, e.g.
## "Thu, 01 Jan 1970 00:00:00 GMT". Day and month names are fixed English
## abbreviations.
sub rfc822date
{
    my ($epoch) = @_;
    my @day_names = qw(Sun Mon Tue Wed Thu Fri Sat);
    my @month_names = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
    ## gmtime fields: 0 sec, 1 min, 2 hour, 3 mday, 4 mon, 5 year, 6 wday.
    my @parts = gmtime($epoch);
    return sprintf "%s, %02d %s %d %02d:%02d:%02d GMT",
        $day_names[$parts[6]], $parts[3], $month_names[$parts[4]],
        $parts[5] + 1900, @parts[2, 1, 0];
}
See also:
original blog entry