Help help! Writer trying to program!
am 23.02.2005 17:41:58 von Andrew.Johnson------_=_NextPart_001_01C519C6.979D0D08
Content-Type: text/plain;
charset="us-ascii"
Content-Transfer-Encoding: quoted-printable
Hello there,
I wrote a script to scrape BusinessWeek's search results. It worked
fine, but now I am trying to authenticate my agent to BusinessWeek
first, before I do my search, so that my search results don't point at
registration pages, and so that I can access the results and parse
them. I realize my code is rough, but that's because I did not
understand the better Perl HTML parsing modules.
=20
The first script is my script that works.=20
=20
The second is my mangled attempt to authenticate.=20
=20
Any help would be much appreciated.
=20
use LWP::Simple;
use HTML::SimpleParse;
use Win32API::File 0.08 qw( :ALL );
use LWP::UserAgent;
use Win32::OLE;
use Win32::SAM;
use Win32::Slingshot;
=20
# NOTE(review): this listing is still in quoted-printable transfer
# encoding and must be decoded before perl can run it.
#
# Script 1 (no login): for every search term, download the BusinessWeek
# search page, keep the part of the page between two text markers in
# output[N].txt, then pull url, bold-text and date lines out of that
# file into goodies[N].txt.
#
# Unbuffer STDOUT so progress messages appear immediately.
$| =3D 1;
# Search terms; each one is spliced verbatim into the query string.
my @words =3D ('Different',
'"key+words"');
=20
# Index of the current term; starts at -1 and is bumped at loop entry.
my $ref =3D -1;
foreach (@words){
$ref++;
# Fetch the result page for this term. The bare get is presumably the
# function exported by LWP::Simple - TODO confirm.
# NOTE(review): @index[$ref] and @words[ $ref] are one-element array
# slices; the scalar forms $index[$ref] and $words[$ref] are the usual
# spelling.
@index[$ref]=3Dget =20
=20
("http://search.businessweek.com/Search?searchTerm=3D@words[ $ref]&skin=3D=
Busines
sWeek&x=3D9&y=3D5");
$p =3D new HTML::SimpleParse( $index[$ref] );
# The bracket characters are literal in the file name, so the files
# are called output[0].txt, output[1].txt, and so on.
open(OUTFILE, ">output[$ref].txt") or die "Can't open
output.txt: $!";
=20
# $flag is 1 while inside the wanted section of the page; $test counts
# the parse-tree elements written to the file.
$flag =3D 0;
$test=3D0;
=20
foreach ($p->tree) { =20
# Start copying at the first element whose rendered text contains the
# Results marker.
if ($p->execute($_) =3D~ /Results /) =20
{
$flag=3D1;
}
if ($flag==1) =20
{
=20
$test++;
print OUTFILE $p->execute($_);
# Stop copying once the element holding the Result page marker has
# itself been written.
if ($p->execute($_) =3D~ /Result page/)
{=20
$flag =3D 0;}
}
=20
}
print "There were $test lines saved for parsing for =
@words[$ref]
\n";
close OUTFILE;
# Second pass: re-read the saved section and write the extracted
# fields to goodies[N].txt.
open(INFILE, "output[$ref].txt") or die "Can't open =
output.txt:
$!";=20
open(OUTFILE, ">goodies[$ref].txt") or die "Can't open
goodies.txt: $!";
=20
# NOTE(review): the loop condition is missing in this copy; presumably
# it read lines from INFILE and the angle-bracket expression was
# stripped as if it were an HTML tag - TODO confirm against the
# original message.
while (
{
# NOTE(review): the pattern on the next line is also cut off, most
# likely for the same reason; its original text cannot be recovered
# from here.
if ($_ =3D~ /
{
# Capture the text between single quotes as the url and a second
# group just before the closing bold tag.
# NOTE(review): as written, the greedy .* in front of the second
# group leaves that group empty; an opening bold tag may have been
# stripped from the pattern - TODO confirm.
($url,$BetweenTheBold) =3D $_ =3D~ =
/.*'(.*)'.*(.*)<\/b>/ ;
print OUTFILE "$url\n";
print OUTFILE "$BetweenTheBold\n";
}
# Otherwise a line with two consecutive digits is treated as a date
# line: the outer group captures from the month name through a
# four-digit number.
# NOTE(review): the alternation below shows a space inside April and
# is split across lines; probably transport damage - TODO confirm.
elsif ($_ =3D~ /\d{2}/ )
{($date) =3D $_ =3D~=20
/-.*((January|February|September|November|December|March|Apr il|May|June|J=
uly
|Augu
st|October).{2}.*\d{4}).*/ ;
print OUTFILE "$date\n\n";
} =20
}
close INFILE;
close OUTFILE;
}
=20
# Merge step: append every goodies[N].txt to total.txt, keeping only
# lines that contain at least one word character, then delete the
# per-term goodies and output files.
# $var walks the same index range that $ref covered in the loop above.
my $var=3D-1;
open(OUTFILE, ">total.txt") or die "Can't open total.txt: $!";=20
while ($var < $ref)
=20
{ $var++;
open(INFILE, "goodies[$var].txt") or die "Can't open
goodies.txt: $!";
# NOTE(review): the loop condition is missing in this copy; presumably
# it read lines from INFILE and the angle-bracket expression was
# stripped as if it were an HTML tag - TODO confirm.
while (
{if ($_ =3D~ /\w/)
{print OUTFILE $_;}=20
}
close INFILE;
# DeleteFile is presumably the function exported by Win32API::File,
# which this script loads with the :ALL tag - TODO confirm.
DeleteFile ("goodies[$var].txt");
DeleteFile ("output[$var].txt");
}
close OUTFILE;
=20
AND WITH AUTHENTICATION
=20
use LWP::Simple;
use HTML::SimpleParse;
use Win32API::File 0.08 qw( :ALL );
use LWP::UserAgent;
use Win32::OLE;
use Win32::SAM;
use Win32::Slingshot;
=20
# NOTE(review): this listing is still in quoted-printable transfer
# encoding and must be decoded before perl can run it.
#
# Script 2 setup: the same search terms as the first script, plus a
# user agent that is handed a user name and password before any
# request is made.
#
# Unbuffer STDOUT so progress messages appear immediately.
$| =3D 1;
# Search terms; each one is spliced verbatim into the query string.
my @words =3D ('Different',
'"key+words"');
=20
# AUTHENTICATE: build the user agent and register credentials on it.
# NOTE(review): the LWP::UserAgent documentation describes the first
# argument as host:port and the second as the realm announced by the
# server; here there is no port and the realm is empty - verify.
# NOTE(review): the searches later in this script go to
# search.businessweek.com, a different host from the one named here,
# so these credentials may never be offered - verify. If the site uses
# a login form with cookies, this call would not apply at all.
# NOTE(review): a real-looking user name and password are hard-coded
# in a publicly archived message.
=20
my $browser =3D LWP::UserAgent->new;
$browser->credentials(
'www-secure.businessweek.com',
'',
'andrewljohnson' =3D> 'hermit85'
);
=20
=20
# Script 2 main loop: for every search term, request the BusinessWeek
# search page through $browser, keep the part of the page between two
# text markers in output[N].txt, then pull url, bold-text and date
# lines out of that file into goodies[N].txt.
#
# Index of the current term; starts at -1 and is bumped at loop entry.
my $ref =3D -1;
foreach (@words){
$ref++;
# NOTE(review): the get method of LWP::UserAgent returns a response
# object rather than the page text, so the parser below is presumably
# handed that object and not HTML; the body is normally read through
# the content or decoded_content method of the response - verify
# against the LWP documentation.
# NOTE(review): @index[$ref] and @words[ $ref] are one-element array
# slices; the scalar forms $index[$ref] and $words[$ref] are the usual
# spelling.
@index[$ref]=3D$browser->get =20
=20
("http://search.businessweek.com/Search?searchTerm=3D@words[ $ref]&skin=3D=
Busines
sWeek&x=3D9&y=3D5");
$p =3D new HTML::SimpleParse( $index[$ref] );
# The bracket characters are literal in the file name, so the files
# are called output[0].txt, output[1].txt, and so on.
open(OUTFILE, ">output[$ref].txt") or die "Can't open
output.txt: $!";
=20
# $flag is 1 while inside the wanted section of the page; $test counts
# the parse-tree elements written to the file.
$flag =3D 0;
$test=3D0;
=20
foreach ($p->tree) { =20
# Start copying at the first element whose rendered text contains the
# Results marker.
if ($p->execute($_) =3D~ /Results /) =20
{
$flag=3D1;
}
if ($flag==1) =20
{
=20
$test++;
print OUTFILE $p->execute($_);
# Stop copying once the element holding the Result page marker has
# itself been written.
if ($p->execute($_) =3D~ /Result page/)
{=20
$flag =3D 0;}
}
=20
}
print "There were $test lines saved for parsing for =
@words[$ref]
\n";
close OUTFILE;
# Second pass: re-read the saved section and write the extracted
# fields to goodies[N].txt.
open(INFILE, "output[$ref].txt") or die "Can't open =
output.txt:
$!";=20
open(OUTFILE, ">goodies[$ref].txt") or die "Can't open
goodies.txt: $!";
=20
# NOTE(review): the loop condition is missing in this copy; presumably
# it read lines from INFILE and the angle-bracket expression was
# stripped as if it were an HTML tag - TODO confirm against the
# original message.
while (
{
# NOTE(review): the pattern on the next line is also cut off, most
# likely for the same reason; its original text cannot be recovered
# from here.
if ($_ =3D~ /
{
# Capture the text between single quotes as the url and a second
# group just before the closing bold tag.
# NOTE(review): as written, the greedy .* in front of the second
# group leaves that group empty; an opening bold tag may have been
# stripped from the pattern - TODO confirm.
($url,$BetweenTheBold) =3D $_ =3D~ =
/.*'(.*)'.*(.*)<\/b>/ ;
print OUTFILE "$url\n";
print OUTFILE "$BetweenTheBold\n";
}
# Otherwise a line with two consecutive digits is treated as a date
# line: the outer group captures from the month name through a
# four-digit number.
# NOTE(review): the alternation below shows a space inside April and
# is split across lines; probably transport damage - TODO confirm.
elsif ($_ =3D~ /\d{2}/ )
{($date) =3D $_ =3D~=20
/-.*((January|February|September|November|December|March|Apr il|May|June|J=
uly
|Augu
st|October).{2}.*\d{4}).*/ ;
print OUTFILE "$date\n\n";
} =20
}
close INFILE;
close OUTFILE;
}
=20
# Merge step: append every goodies[N].txt to total.txt, keeping only
# lines that contain at least one word character, then delete the
# per-term goodies and output files.
# $var walks the same index range that $ref covered in the loop above.
my $var=3D-1;
open(OUTFILE, ">total.txt") or die "Can't open total.txt: $!";=20
while ($var < $ref)
=20
{ $var++;
open(INFILE, "goodies[$var].txt") or die "Can't open
goodies.txt: $!";
# NOTE(review): the loop condition is missing in this copy; presumably
# it read lines from INFILE and the angle-bracket expression was
# stripped as if it were an HTML tag - TODO confirm.
while (
{if ($_ =3D~ /\w/)
{print OUTFILE $_;}=20
}
close INFILE;
# DeleteFile is presumably the function exported by Win32API::File,
# which this script loads with the :ALL tag - TODO confirm.
DeleteFile ("goodies[$var].txt");
DeleteFile ("output[$var].txt");
}
close OUTFILE;
=20
=20
=20
( Andrew Johnson )=20
) Marketing Writer (
( Elias/Savion Advertising )
( Phone: 412.642.7700 Fax 412.642.2277 )
) www.elias-savion.com (
( andrew.johnson@elias-savion.com )
=20
=20
------_=_NextPart_001_01C519C6.979D0D08--