#!/usr/bin/env perl
# flickr-scrape-set --- scrape flickr web sets for image urls

# Author: Noah Friedman
# Created: 2009-01-28
# Public domain

# $Id: flickr-scrape-set,v 1.4 2010/10/03 04:43:27 friedman Exp $

# Commentary:

# This script prints the url to each image in a flickr set.
#
# For example:
#
# $ flickr-scrape-set http://flickr.com/photos/inoah/sets/72157612135568290/
# http://farm2.static.flickr.com/1082/3165808612_09f27725ba_o.jpg
# ...
# http://farm4.static.flickr.com/3124/3165659540_6c8786a048_o.jpg
# $
#
# These images can then be retrieved with wget, curl, etc.
#
# If the set has multiple pages, all pages will be crawled.
#
# By default the "original size" image urls will be retrieved.  This can be
# overridden by specifying the size as the second argument; it can be one of:
#
#     "o"   (original)
#     "b"   (large)      -- seems to be used when o not available
#     "l"   (large)
#     "z"   (medium640)
#     "m"   (medium500)
#     "s"   (small)
#     "t"   (thumbnail)
#     "sq"  (square)
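#
# For example, since the output is one url per line, it can be piped
# straight into wget, which reads urls from stdin when given "-i -":
#
# $ flickr-scrape-set http://flickr.com/photos/inoah/sets/72157612135568290/ b | wget -i -
#
# Here "b" requests the "large" size, which is useful for sets where the
# originals are not available.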
#
# Sets or images which are only viewable by friends, etc. can be retrieved
# using auth information present in the Firefox 3.0 or later cookies
# database.  This script doesn't presently understand the traditional
# cookies.txt format used in older versions.

# Code:

$^W = 1;  # enable warnings

use LWP;
use URI;
use DBI;
use strict;

my @default_cookie_db =
  ( "$ENV{HOME}/.cookies.sqlite",
    "$ENV{HOME}/.mozilla/firefox/default/cookies.sqlite",
    "$ENV{HOME}/etc/misc/.flickr-scape-set/cookies.sqlite",
  );

sub make_cookie
{
  my ($url, $cookiedb) = @_;

  unless (defined $cookiedb)
    {
      for my $f (@default_cookie_db)
        {
          next unless -f $f;
          $cookiedb = $f;
          last;
        }
    }
  return unless (defined $cookiedb && -f $cookiedb);

  return unless $url =~ m|^https?://(?:\S+@)?([^/]+)(/.*)|;
  my ($host, $path) = ($1, $2);

  my $dbh = DBI->connect ("DBI:SQLite:dbname=$cookiedb");
  return unless $dbh;

  my $stm = "select name, value, path from moz_cookies where host = lower(?) and expiry >= ?";
  my $sth = $dbh->prepare ($stm);
  my $now = time ();

  # Look up cookies for the exact host and for each dot-prefixed parent
  # domain (e.g. ".flickr.com"), since cookies may be scoped to a parent
  # domain rather than to the host itself.
  my %nv;
  my @hdn;
  for (my $h = ".$host"; $h ne ''; $h =~ s/^\.[^.]+//) { push @hdn, $h }
  for my $hdn ($host, @hdn)
    {
      $sth->execute ($hdn, $now);
      while (my $row = $sth->fetchrow_hashref)
        {
          my ($name, $val, $cpath) = @$row{qw(name value path)};
          next if exists $nv{$name};
          # Only keep cookies whose path is a prefix of the requested path.
          next unless (length ($cpath) <= length ($path)
                       && substr ($path, 0, length ($cpath)) eq $cpath);
          $nv{$name} = $val unless exists $nv{$name};
        }
    }
  return unless %nv;
  return join (" ", map { sprintf ("%s=%s;", $_, $nv{$_}) } sort keys %nv);
}

# Probably needs more error checking, but I don't want to turn this into
# another full-blown lwp-request script.
sub http_get
{
  my ($url, $headers) = @_;

  my $request = HTTP::Request->new ('GET');
  my $ua      = LWP::UserAgent->new;
  my $uri     = URI->new ($url);

  while (my ($name, $val) = each %$headers)
    {
      $request->header ($name, $val);
    }
  $request->url ($uri);

  my $response = $ua->request ($request);
  return $response->content;
}

# Get all of the top pages of the set, if there are continuations.
# Returns an array of the html content of each.
sub set_pages
{
  my ($first, $headers) = @_;

  $first .= "/" unless $first =~ m=/$=;
  my $base = $first;
  $base =~ s=^(.*?//.*?)/.*=$1=;

  my @to_process = ($first);
  my @pages;
  my %mapped = ($first => 1);
  while (@to_process)
    {
      my $page    = shift @to_process;
      my $content = http_get ($page, $headers);
      push @pages, $content;

      # Queue any continuation ("page=N") links we haven't already seen.
      while ($content =~ m|\G.*?href="(/[^"]*/?page=\d+)"|sg)
        {
          my $new = $base . $1;
          push @to_process, $new unless exists $mapped{$new};
          $mapped{$new} = 1;
        }
    }
  return \@pages;
}

# Return a list of all individual image pages in the set
sub set_links
{
  my ($pages) = @_;
  local $_;

  my @href;
  map { while (m|\G.*?href="([^"]*/set-\d+/)"|sg)
          {
            push @href, $1;
          }
      } @$pages;
  return \@href;
}

# Take the image page urls and get the download page for the specified size
sub size_url
{
  my ($size, $href) = @_;

  my @osize;
  for my $url (@$href)
    {
      my $s = $url;
      $s =~ s=/in/set-=/sizes/$size/in/set-=;
      push @osize, $s;
    }
  return \@osize;
}

# Scrape the download page and get the actual image url
sub image_url
{
  my ($page_url, $headers) = @_;

  my $picid = '';
  $picid = $1 if $page_url =~ m=/(\d+)/sizes=;

  my $content = http_get ($page_url, $headers);
  return join (".", $1, $2)
    if ($content =~ m/="([^"]+)_d\.(jpg|png|gif)"/
        || $content =~ m/src="([^"]+\/${picid}_.*?)\.(jpg|png|gif)/);
}

sub main
{
  my ($set_url, $size) = @_;

  return unless defined $set_url;  # todo: write usage fn
  $size = 'o' unless defined $size;

  my $base = $set_url;
  $base =~ s=^(.*?//.*?)/.*=$1=;

  # Really we should be checking for the correct cookie to send for each
  # url, i.e. this lookup should be in http_get.  But for performance
  # reasons, just do it once for the top url.  Flickr doesn't have
  # path-specific or host-specific cookies that we care about.
  my %header;
  my $cookie = make_cookie ($set_url);
  $header{Cookie} = $cookie if defined $cookie;

  my $pages         = set_pages ($set_url, \%header);
  my $links         = set_links ($pages);
  my $orig_size_url = size_url ($size, $links);

  map { $_ = $base . $_ } @$orig_size_url;  # Modify array by side effect
  map { print image_url ($_, \%header), "\n" } @$orig_size_url;
}

main (@ARGV);

# eof