#!/usr/bin/perl
#
# httpGrab.pl - use http to grab a file across the Web.
#
# Copyright 2001 G. Wade Johnson
#
# This script is released under the Perl Artistic License.

use strict;
use warnings;

use Getopt::Long;
use HTTP::Request;
use LWP::UserAgent;
use URI::URL;

our $VERSION = '0.95';

# ---------------------------------------------------
# Command line handling
#
# require_order stops option parsing at the first non-option argument (the
# url), bundling allows '-rhb', and no_ignore_case keeps -b and -B distinct.
my %opts = ();
Getopt::Long::Configure( qw/require_order bundling no_ignore_case/ );
GetOptions(
    \%opts,
    qw/B p:i b h r s m=s a=s c=s@ t=s H=s% x=s f=s A=s/
) or Usage();

my @urls     = ();
my $timeref  = sub { return time; };    # replaced by Time::HiRes if available
my $method   = $opts{m} || 'GET';
my $agent    = $opts{a} || "httpGrab/$VERSION";
my $type     = $opts{t} || 'application/x-www-form-urlencoded';
my $bProfile = exists $opts{p};
my $passes   = $opts{p} || 0;

# Body for POST requests. It is read at most once and then reused, so that
# every url loaded with -f is sent the same body.
my $post_body;

# Print all if none of the response options picked.
my $all = !grep { exists $opts{$_} } qw/b B h r/;

# Pre-compute the Basic authentication credentials, if requested.
my $auth;
if(exists $opts{A}) {
    eval { require MIME::Base64; };
    die "MIME::Base64 required for WWW-Authenticate support.\n" if $@;
    # The empty second argument suppresses the trailing newline.
    $auth = MIME::Base64::encode( $opts{A}, '' );
}

my $ua = LWP::UserAgent->new;
$ua->agent( $agent );

# Collect the urls, either from the -f file or from the command line.
if(exists $opts{f}) {
    open( my $fh, '<', $opts{f} )
        or die "Unable to open '$opts{f}': $!\n";
    chomp( @urls = <$fh> );
    close( $fh );

    # Strip surrounding whitespace (including a stray \r from files written
    # on another platform), then remove empty lines.
    foreach my $line (@urls) {
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
    }
    @urls = grep { length $_ } @urls;
    die "No urls found in '$opts{f}'\n" unless @urls;
}
else {
    Usage() unless @ARGV;
    $urls[0] = shift;
}

# -s selects simple_request, which does not follow redirects.
my $make_request = $opts{s}
    ? sub { return $ua->simple_request( shift ); }
    : sub { return $ua->request( shift ); };

if($bProfile) {
    # Prefer high-resolution timing, but fall back to time() without it.
    eval {
        require Time::HiRes;
        $timeref = sub { return Time::HiRes::time(); };
    };
    print STDERR "HiRes Timing not available.\n" if $@;

    my $bShowUrl = @urls > 1;
    foreach my $url (@urls) {
        print "Url: $url\n" if $bShowUrl;
        http_grab_profile( $url, $passes );
    }
}
else {
    http_grab( $_ ) foreach @urls;
}

# ---------------------------------------------------
# Perform the HTTP request and display appropriate parts
#
# The whole response is printed unless one or more of -r, -h, -b, -B
# was supplied, in which case only the selected parts are printed.
#
#  Input: $url   target url for the request
sub http_grab {
    my $url = shift;

    my $rq = setup_request( $url );
    my $rs = $make_request->( $rq );

    if($all) {
        print $rs->as_string;
        return;
    }

    if($opts{r}) {
        # The response line is the first line of the stringified response.
        my ($respline) = $rs->as_string =~ m/^(.*?)[\r\n]/;
        print "$respline\n";
    }

    if($opts{h}) {
        print $rs->headers_as_string;
    }

    if($opts{b} || $opts{B}) {
        # A response without a Content-Type header is treated as non-text.
        my $ctype = $rs->header( 'Content-Type' );
        $ctype = '' unless defined $ctype;

        # Write binary when forced with -B, or when the output is redirected
        # and the body is not identified as text.
        binmode STDOUT
            if $opts{B} or !( -t STDOUT or $ctype =~ m#text/# );
        print $rs->content;
    }

    return;
}

# ---------------------------------------------------
# Perform the HTTP request and time it
#
# The request object is built once, outside of the timed section.
#
#  Input: $url     target url for the request
#         $passes  number of times to make the request
sub http_grab_profile {
    my $url    = shift;
    my $passes = shift;

    my $rq = setup_request( $url );

    if($passes < 2) {
        my $start = $timeref->();
        $make_request->( $rq );
        my $end = $timeref->();

        printf "Request took %.6g seconds.\n\n", ($end-$start);
        return;
    }

    my $start = $timeref->();
    $make_request->( $rq ) foreach 1..$passes;
    my $end = $timeref->();

    printf "Request took %.6g seconds (total)\n\t or %.6g seconds/request.\n\n",
        ($end-$start), ($end-$start)/$passes;

    return;
}

# ---------------------------------------------------
# Build a request based on the command line parameters and the
# supplied URL
#
#  Input: $url   target url for the request
#
#  Output: an HTTP::Request object
sub setup_request {
    my $url = shift;

    if(exists $opts{x}) {
        # The proxy is registered for the scheme of this url. A url without
        # a scheme gives nothing to register the proxy for.
        my $scheme = URI::URL->new( $url )->scheme;
        $ua->proxy( $scheme, $opts{x} ) if defined $scheme;
    }

    my $rq = HTTP::Request->new( $method, $url );
    $rq->header( %{$opts{H}} ) if exists $opts{H} and keys %{$opts{H}};
    $rq->header( 'Cookie' => join( '; ', @{$opts{c}} ) ) if exists $opts{c};
    $rq->header( 'Authorization' => "Basic $auth" ) if exists $opts{A};

    if(uc( $method ) eq 'POST') {
        $rq->header( 'Content-type' => $type ) if $type;
        $rq->content( read_post_body() );
    }

    return $rq;
}

# ---------------------------------------------------
# Retrieve the body for POST requests
#
# The body is read on the first call only and cached in $post_body, so
# later requests (multiple urls from -f) reuse it instead of reading an
# already exhausted input.
#
#  Output: the body as a single string
sub read_post_body {
    unless(defined $post_body) {
        # Prompt on STDERR so the prompt never ends up in captured output,
        # and only when a person is actually typing the body.
        print STDERR "Enter the body of the POST request:\n"
            if !@ARGV and -t STDIN;
        $post_body = join( '', <> );
    }

    return $post_body;
}

# ---------------------------------------------------
# Usage message
#
# Prints the usage text on STDERR and terminates the script.
sub Usage {
    die <<"EOM";
Usage $0 [options] url
  or  $0 [options] -f file

Where options are any of
  -m method           perform a 'method' request instead of a GET
  -a agent-string     provide a user-agent string
  -c cookie           provide a cookie, can be used multiple times for
                      multiple cookies.
  -t content-type     provide a content-type for POST requests
  -x proxy            provide a proxy server for the request
  -H header=value     provide header data for the request, can be used
                      multiple times, argument is of the form header=value
  -A userid:password  provide a userid and password string for HTTP Basic
                      authentication
  -b                  output the body of the response
  -B                  output the body of the response, forced binary write
  -h                  output the headers of the response
  -r                  output the response line
  -p [n]              profiling, return the time taken in seconds, supports
                      an optional number of repetitions
  -f file             load urls from a file instead of from command line
  -s                  simple request, do not follow redirects
EOM
}

__END__

=head1 NAME

httpGrab.pl - use HTTP to grab a file across the Web

=head1 Purpose

B<httpGrab> uses the LWP library to make HTTP requests.

=head1 SCRIPT CATEGORIES

HTTP - suggested

=head1 PREREQUISITES

This script depends on both the C<strict> and C<warnings> pragmas. The script
also uses the C<Getopt::Long>, C<LWP::UserAgent>, C<HTTP::Request>, and
C<URI::URL> modules.

=head1 COREQUISITES

If the C<Time::HiRes> module is available, it is used to generate
higher-resolution timings for the B<-p> profiling option.

If the C<MIME::Base64> module is available, it can be used to generate Basic
HTTP authentication for a request.

=head1 OSNAMES

any. Tested on MSWin32 and Linux.

=head1 Description

B<httpGrab> uses the LWP library to make HTTP requests. Although the output is
normally written to STDOUT, B<httpGrab> can also be used in simple profiling
and other tests.

B<httpGrab> began as a simple script to understand the C<LWP::UserAgent>
module. Although it was originally written as a throw-away script, it was
useful in working on dynamic web sites. As time went on, the script was
enhanced with various extra features to make it useful for the work I was
doing. This accumulation of features has resulted in a script unlike any other
of its type.

=head1 Usage

  Usage httpGrab.pl [options] url
    or  httpGrab.pl [options] -f file

  Where options are any of
    -m method           perform a 'method' request instead of a GET
    -a agent-string     provide a user-agent string
    -c cookie           provide a cookie, can be used multiple times for
                        multiple cookies.
    -t content-type     provide a content-type for POST requests
    -x proxy            provide a proxy server for the request
    -H header=value     provide header data for the request, can be used
                        multiple times, argument is of the form header=value
    -A userid:password  provide a userid and password string for HTTP Basic
                        authentication
    -b                  output the body of the response
    -B                  output the body of the response, forced binary write
    -h                  output the headers of the response
    -r                  output the response line
    -p [n]              profiling, return the time taken in seconds, supports
                        an optional number of repetitions
    -f file             load urls from a file instead of from command line
    -s                  simple request, do not follow redirects

The options are separated into four groupings. The first set defines the
request. The second set defines what parts of the response are printed. The
third set specifies profiling options. The last option specifies a file
containing a list of URLs to request.

=head2 Request Options

The B<-m> option allows the user to specify the HTTP method to use on this
request. The default is 'GET'. If the method specified is 'POST', B<httpGrab>
will retrieve the body of the request from STDIN.

The B<-a> option specifies a user agent identification. This is particularly
useful for pages that have different behavior for different browsers. The
default user-agent string is 'httpGrab/0.95'.

The B<-c> option specifies a I<cookie> to be sent with the request. The B<-c>
option may be supplied multiple times to send multiple cookies. The value of
the argument for this option is of the form 'name=value'.

The B<-t> option specifies the content-type header for a 'POST' request. The
default is 'application/x-www-form-urlencoded'. This content type is ignored
unless the specified method is 'POST'.

The B<-x> option specifies a proxy server to use when attempting to reach the
specified URLs.

The B<-H> option provides a header for the request. This option can be
supplied multiple times for multiple headers. The argument for this option is
of the form 'header=value'.

The B<-A> option provides a userid and password for use with the HTTP Basic
authentication scheme.

=head2 Response Options

By default, B<httpGrab> prints all of the response to STDOUT. This behavior
can be modified through the use of one or more of the following options. To
duplicate the default behavior, use the options B<-rhb>.

The B<-b> option causes the body of the response to be written to STDOUT.

The B<-h> option causes the headers of the response to be written to STDOUT.

The B<-r> option causes the response line to be written to STDOUT.

The B<-B> option causes the body of the response to be written to STDOUT, just
like the B<-b> option. However, this option causes the output to be written as
binary. This is particularly useful to allow the retrieval of binary data that
has been misidentified by the server without translation by some operating
systems.

=head2 Profiling Option

The profiling option performs only the most basic timing test. The time is
measured from the beginning of the request until the response is completely
returned.
The profiling in B<httpGrab> does not (currently) support downloading any
embedded components of the page, such as images or stylesheets.

The B<-p> option turns on profiling and optionally supplies a number of times
to repeat the request for better accuracy.

=head2 URL File

The B<-f> option specifies a file from which to read a list of URLs. These
URLs will be requested in order by B<httpGrab>.

The only possibly surprising result is the interaction between B<-f> and
B<-p>. If the B<-p> option supplies a number, B<httpGrab> makes the request
multiple times on the first URL. Then, it makes multiple requests on the
second URL, etc.

=head1 Outstanding issues

In general B<httpGrab> works fairly well, but there are a few features I would
like to add, at some point.

=over 4

=item *

Ability to do SSL.

=item *

Ability to retrieve embedded content and stylesheets for timing.

=back

To my knowledge there are no bugs in the current release.

=cut