TABLE OF CONTENTS
- 1. URL/check_for_response
- 2. URL/extract_url
- 3. URL/get_urlid
- 4. URL/handle_url
- 5. URL/has_imagedata
- 6. URL/process_url
- 7. URL/record_url
URL/check_for_response [ Functions ]
FUNCTION
Checks if URL is fetchable.
INPUTS
url -- URL to fetch.
RESULT
The content type, e.g. "image/jpeg".
NOTES
Name conflicts with actual source code. Does not follow redirects at the moment.
SOURCE
sub URL::check_for_response { my $url = shift; my $ct; my $ua = LWP::UserAgent->new( agent => "irc-collective/0.1.4" ); my $response = $ua->head($url); Debug::print("Status code: " . $response->code()); Debug::print("Status line: " . $response->status_line()); if ($response->is_error()) { Debug::print("Request is errorneus, abort."); } elsif ($response->is_success()) { Debug::print("Request is successful, continue."); } elsif ($response->is_redirect()) { Debug::print("Got redirect, follow?"); } elsif ($response->is_info()) { Debug::print("What to do with informational response?"); } $ct = $response->content_type; Debug::print("Content-type: " . $ct); return $ct; }
URL/extract_url [ Functions ]
FUNCTION
Filters an URL from the line.
INPUTS
content -- the line, possible containing one or more URLs.
RESULT
The filtered URL if found, undef otherwise.
NOTES
Only supports ftp and http at the moment. Does not scan for multiple occurances, only first occurance is returned.
SOURCE
sub URL::extract_url { my $content = shift; my $url; if ($content =~ m/.*((ftp|http):\/\/\S+).*/) { $url = $1; } return $url; }
URL/get_urlid [ Functions ]
FUNCTION
Returns ID for new or existing URL.
INPUTS
dbh -- database handle date -- date in YYYYMMDD format, e.g. "20070813" line_no -- line number in logfile, e.g. "210"
RESULT
ID of URL stored in database.
SOURCE
sub URL::get_urlid { my $dbh = shift; my ($date, $line_no) = @_; my $urlid = 0; # see if we already have an existing urlid my $sth = $dbh->prepare( qq[SELECT urlid FROM urls WHERE from_date=? AND from_lineno=? LIMIT 1] ); $sth->execute($date, $line_no) or print STDERR "ERROR: Could not perform query to get urlid. ", "Reason: ", $dbh->errstr, "\n"; # get/set urlid if ($sth->rows != 0) { # use existing hash my $record = $sth->fetchrow_hashref; $urlid = $record->{urlid}; } else { # create a new hash my $md5 = Digest::MD5->new; $md5->add($date, $line_no); $urlid = $md5->hexdigest; } $sth->finish; return $urlid; }
URL/handle_url [ Functions ]
FUNCTION
Wrapper function to record URLs.
INPUTS
args -- dbh, year, month, day, line_no, time, nick and url
SOURCE
sub URL::handle_url{ my @args = @_; if (URL::record_url(@args) > 0) { Debug::print("Added new url."); } }
URL/has_imagedata [ Functions ]
FUNCTION
Checks content type for image tag
INPUTS
ct -- content type, e.g. "image/jpeg"
RESULT
True if image, false otherwise.
SOURCE
sub URL::has_imagedata { my $ct = shift; my $image_tag = 0; if ($ct =~ m/.*image.*/) { $image_tag = 1; } return $image_tag; }
URL/process_url [ Functions ]
FUNCTION
Wrapper function to handle images and URLs.
INPUTS
imagdir -- directory where local copy of images are stored dbh -- database handle year -- year in YYYY format, e.g. "2007" month -- month in MM format, e.g. "08" day -- day in DD format, e.g. "13" line_no -- line number in logfile, e.g. "210" time -- time in HH:MM[:SS] format, e.g. "14:48" nick -- nickname, e.g. "rjm" url -- url to record, e.g. "http://www.irc-collective.org/" ct -- content type, e.g. "image/jpeg"
SOURCE
sub URL::process_url { my $imagedir = shift; my $dbh = shift; my $year = shift; my $month = shift; my $day = shift; my $line_no = shift; my $time = shift; my $nick = shift; my $url = shift; my $ct = shift; # handle image or regular url if (URL::has_imagedata($ct)) { Image::handle_image( $imagedir, $dbh, $year, $month, $day, $line_no, $time, $nick, $url, $ct ); } else { URL::handle_url( $dbh, $year, $month, $day, $line_no, $time, $nick, $url ); # should add $ct to urls as well } # todo: return state }
URL/record_url [ Functions ]
FUNCTION
Records URL to database.
INPUTS
dbh -- database handle year -- year in YYYY format, e.g. "2007" month -- month in MM format, e.g. "08" day -- day in DD format, e.g. "13" line_no -- line number in logfile, e.g. "210" time -- time in HH:MM[:SS] format, e.g. "14:48" nick -- nickname, e.g. "rjm" url -- url to record, e.g. "http://www.irc-collective.org/"
RESULT
True on success, false otherwise.
SOURCE
sub URL::record_url { my $dbh = shift; my ($year, $month, $day, $line_no, $time, $nick, $url) = @_; my $date = "$year$month$day"; # requirement lz # prevent storing url multiple times for same logfile # usefull when invoking parselog every 10 minutes e.g. my $urlid = URL::get_urlid($dbh, $date, $line_no); my $protocol = URI->new($url)->scheme(); my $sth = $dbh->prepare( qq[INSERT INTO urls (urlid, url, from_date, from_lineno, from_time, from_nick, protocol) VALUES (?, ?, ?, ?, ?, ?, ?)]); $sth->execute( $urlid, $url, $date, $line_no, $time, $nick, $protocol ) or print "ERROR: Could not add URL to database: ", $dbh->errstr, "\n"; $sth->finish; return $sth->rows; }