#!/usr/bin/env perl
# flickr-scrape-set --- scrape flickr web sets for image urls

# Author: Noah Friedman
# Created: 2009-01-28
# Public domain

# $Id: flickr-scrape-set,v 1.4 2010/10/03 04:43:27 friedman Exp $

# Commentary:

# This script prints the url to each image in a flickr set.
#
# For example:
#
# $ flickr-scrape-set http://flickr.com/photos/inoah/sets/72157612135568290/
# http://farm2.static.flickr.com/1082/3165808612_09f27725ba_o.jpg
# ...
# http://farm4.static.flickr.com/3124/3165659540_6c8786a048_o.jpg
# $
#
# These images can then be retrieved with wget, curl, etc.
#
# If the set has multiple pages, all pages will be crawled.
#
# By default the "original size" image urls will be retrieved.  This can be
# overridden by specifying the size as the second argument; it can be one of:
#
#     "o"   (original)
#     "b"   (large)      -- seems to be used when o not available
#     "l"   (large)
#     "z"   (medium640)
#     "m"   (medium500)
#     "s"   (small)
#     "t"   (thumbnail)
#     "sq"  (square)
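#
# For example, since the output is one url per line, it can be piped
# straight into wget, which reads urls from stdin when given "-i -":
#
# $ flickr-scrape-set http://flickr.com/photos/inoah/sets/72157612135568290/ b | wget -i -
#
# Here "b" requests the "large" size, which is useful for sets where the
# originals are not available.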
#
# Sets or images which are only viewable by friends, etc. can be retrieved
# using auth information present in the Firefox 3.0 or later cookies
# database.  This script doesn't presently understand the traditional
# cookies.txt format used in older versions.

# Code:

$^W = 1;  # enable warnings

use LWP;
use URI;
use DBI;
use strict;

my @default_cookie_db =
  ( "$ENV{HOME}/.cookies.sqlite",
    "$ENV{HOME}/.mozilla/firefox/default/cookies.sqlite",
    "$ENV{HOME}/etc/misc/.flickr-scape-set/cookies.sqlite",
  );

sub make_cookie
{
  my ($url, $cookiedb) = @_;

  unless (defined $cookiedb)
    {
      for my $f (@default_cookie_db)
        {
          next unless -f $f;
          $cookiedb = $f;
          last;
        }
    }
  return unless (defined $cookiedb && -f $cookiedb);

  return unless $url =~ m|^https?://(?:\S+@)?([^/]+)(/.*)|;
  my ($host, $path) = ($1, $2);

  my $dbh = DBI->connect ("DBI:SQLite:dbname=$cookiedb");
  return unless $dbh;

  my $stm = "select name, value, path from moz_cookies where host = lower(?) and expiry >= ?";
  my $sth = $dbh->prepare ($stm);
  my $now = time ();

  # Look up cookies for the exact host and for each dot-prefixed parent
  # domain (e.g. ".flickr.com"), since cookies may be scoped to a parent
  # domain rather than to the host itself.
  my %nv;
  my @hdn;
  for (my $h = ".$host"; $h ne ''; $h =~ s/^\.[^.]+//) { push @hdn, $h }
  for my $hdn ($host, @hdn)
    {
      $sth->execute ($hdn, $now);
      while (my $row = $sth->fetchrow_hashref)
        {
          my ($name, $val, $cpath) = @$row{qw(name value path)};
          next if exists $nv{$name};
          # Only keep cookies whose path is a prefix of the requested path.
          next unless (length ($cpath) <= length ($path)
                       && substr ($path, 0, length ($cpath)) eq $cpath);
          $nv{$name} = $val unless exists $nv{$name};
        }
    }
  return unless %nv;
  return join (" ", map { sprintf ("%s=%s;", $_, $nv{$_}) } sort keys %nv);
}

# Probably needs more error checking, but I don't want to turn this into
# another full-blown lwp-request script.
sub http_get
{
  my ($url, $headers) = @_;

  my $request = HTTP::Request->new ('GET');
  my $ua      = LWP::UserAgent->new;
  my $uri     = URI->new ($url);

  while (my ($name, $val) = each %$headers)
    {
      $request->header ($name, $val);
    }
  $request->url ($uri);

  my $response = $ua->request ($request);
  return $response->content;
}

# Get all of the top pages of the set, if there are continuations.
# Returns an array of the html content of each.
sub set_pages
{
  my ($first, $headers) = @_;

  $first .= "/" unless $first =~ m=/$=;
  my $base = $first;
  $base =~ s=^(.*?//.*?)/.*=$1=;

  my @to_process = ($first);
  my @pages;
  my %mapped = ($first => 1);
  while (@to_process)
    {
      my $page    = shift @to_process;
      my $content = http_get ($page, $headers);
      push @pages, $content;

      # Queue any continuation ("page=N") links we haven't already seen.
      while ($content =~ m|\G.*?href="(/[^"]*/?page=\d+)"|sg)
        {
          my $new = $base . $1;
          push @to_process, $new unless exists $mapped{$new};
          $mapped{$new} = 1;
        }
    }
  return \@pages;
}

# Return a list of all individual image pages in the set
sub set_links
{
  my ($pages) = @_;
  local $_;

  my @href;
  map { while (m|\G.*?href="([^"]*/set-\d+/)"|sg)
          {
            push @href, $1;
          }
      } @$pages;
  return \@href;
}

# Take the image page urls and get the download page for the specified size
sub size_url
{
  my ($size, $href) = @_;

  my @osize;
  for my $url (@$href)
    {
      my $s = $url;
      $s =~ s=/in/set-=/sizes/$size/in/set-=;
      push @osize, $s;
    }
  return \@osize;
}

# Scrape the download page and get the actual image url
sub image_url
{
  my ($page_url, $headers) = @_;

  my $picid = '';
  $picid = $1 if $page_url =~ m=/(\d+)/sizes=;

  my $content = http_get ($page_url, $headers);
  return join (".", $1, $2)
    if ($content =~ m/="([^"]+)_d\.(jpg|png|gif)"/
        || $content =~ m/src="([^"]+\/${picid}_.*?)\.(jpg|png|gif)/);
}

sub main
{
  my ($set_url, $size) = @_;

  return unless defined $set_url;  # todo: write usage fn
  $size = 'o' unless defined $size;

  my $base = $set_url;
  $base =~ s=^(.*?//.*?)/.*=$1=;

  # Really we should be checking for the correct cookie to send for each
  # url, i.e. this lookup should be in http_get.  But for performance
  # reasons, just do it once for the top url.  Flickr doesn't have
  # path-specific or host-specific cookies that we care about.
  my %header;
  my $cookie = make_cookie ($set_url);
  $header{Cookie} = $cookie if defined $cookie;

  my $pages         = set_pages ($set_url, \%header);
  my $links         = set_links ($pages);
  my $orig_size_url = size_url ($size, $links);

  map { $_ = $base . $_ } @$orig_size_url;  # Modify array by side effect
  map { print image_url ($_, \%header), "\n" } @$orig_size_url;
}

main (@ARGV);

# eof