Sending output to Spreadsheet
am 03.01.2007 06:22:51 von dysgraphia(Reposted in new thread...)
Hi, New to Perl, using ActiveState 5.8, Win XP
I'm not sure if this is the best ng...should misc be my first
port of call?
I am trying to adapt Brent Hughes rget-links.pl original code by
collecting discovered web links into a spreadsheet for later use.
BTW, Any errors are due to me, not Brent!
The code runs OK and prints out the web links found into the command window.
I have looked at the Spreadsheet::SimpleExcel module but I cannot work
out the syntax to get the accumulated links into my_List.xls file.
Any suggestions will be appreciated!
Cheers, Peter
#!/usr/bin/perl
use warnings;
use strict;
package RGetLinks;
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;
use Getopt::Long;
use Spreadsheet::SimpleExcel;
$| = 1;
# global data for this program
my $depth;
my %files;
# command line options
my $opt_depth = 4;
# retrieve command line options
my $options = GetOptions ("depth=i" => \$opt_depth); # numeric
my $url = 'http://somesite/';
# abort if the options are improperly formatted
if(!defined $url){ usage(); }
# program enters actual processing at this point
rgetlinks($url,$opt_depth);
# create a new instance of Excel
my $excel = Spreadsheet::SimpleExcel->new();
# add worksheet
$excel->add_worksheet('Sheet1',{-headers => \@header, -data => \@data});
# print result into a file and handle error
$excel->output_to_file('c:/Documents and Settings/my_List.xls') or die
$excel->errstr();
$excel->errstr();
# Subroutines
# A routine to get links recursively
sub rgetlinks
{
my($url,$maxdepth) = @_;
chomp($url);
# initialize globals
$depth = 0;
%files = ();
# descend
rgetlinkshelper($url,$maxdepth);
}
# A helper routine to get links recursively
sub rgetlinkshelper
{
my($url,$maxdepth) = @_;
# return if too deep or already been here
if($depth >= $maxdepth || defined $files{$url})
{
return;
}
else
{
# drop down a level and add the file to the hash
$depth++; $files{$url} = 1;
# show our current location
foreach(1..$depth) {print ' ';}
print $url, "\n";
# retrieve all links
my @links = getlinks($url);
# recursive step
foreach(@links){ rgetlinkshelper($_,$maxdepth); }
# pop up a level # line 101
$depth--;
}
}
# A routine to return links from a URL
# Only retrieve links from text/html files.
my @links = ();
sub getlinks
{
my($url) = @_; # for instance
my $ua = new LWP::UserAgent;
# Make the parser. Unfortunately, we don't know the base yet
# (it might be diffent from $url)
@links = ();
my $p = HTML::LinkExtor->new(\&callback);
# Look at the header to determine what type of document we have
my $headreq = HTTP::Request->new(HEAD => $url);
my $headres = $ua->request($headreq);
my $type = $headres->header('content-type');
# only parse the document for links if it is a text or html document
if(defined $type && $type =~ /text|html/)
{
# Request document and parse it as it arrives
my $getreq = HTTP::Request->new(GET => $url);
my $getres = $ua->request($getreq, sub{ $p->parse($_[0])});
# Expand all URLs to absolute ones
my $base = $getres->base;
@links = map { $_ = url($_, $base)->abs; } @links;
}
# Return the links
return @links;
}
# Set up a callback that collects links
sub callback {
my($tag, %attr) = @_;
return if $tag ne 'a'; # we only look closer at
push(@links, values %attr);
}
# A routine to provide instructions
sub usage
{
# strip the progname with a regex
my $progname = $0;
$progname =~ s/(.*\\|.*\/)(.*)/$2/g;
# show instructions
print "\nUsage:\n\t\t",
$progname, " [args] target-url > output-file\n\n",
"Example:\n\t\t",
$progname, " --depth=4 http://www.perl.org\n\n"; # depth=3
print "Options\n", "=======\n",
"--depth\t\t",
"The maximum depth of links to traverse (default = 3)\n";
exit();
}