#!/usr/bin/perl
#
# did_archive - Produces a list of Castaway files that are available for
#               download from the BBC Desert Island Discs site.
#
# Version 1.0 (C) 2012 Andrew Rawlins (awr-didarchive@fermit.org.uk)
#
# Please be considerate when using this script to download the archive:
# don't use it to hammer the BBC download server.  Try splitting the
# output into multiple files and running each in turn over time.
#
# Usage: perl did_archive.pl > get_did.sh

use WWW::Mechanize;

# Work out how many castaway pages we need to look for ...
my $url = "http://www.bbc.co.uk/radio4/features/desert-island-discs/find-a-castaway#/page/1";
my $mech  = WWW::Mechanize->new();
$mech->get( $url );
my @pages = $mech->links();

# Castaways always appear twice ... this is a bit hacky but allows us to
# find just the castaway links.
my $first_link = "";
my $second_link = "";
my $found_did = 0;

# Calculate the size of the archive
foreach my $pages (@pages) 
{

	if ($pages->url() =~ /page/) 
	{
		$plink = $pages->text();
		if ($plink > 145) 
		{
			$last_page = $plink;
		}
	}
}

# Itterate the archive and look for valid castaways 
for($i = $last_page; $i >=1; $i--) 
{	

	$castaway_url = "http://www.bbc.co.uk/radio4/features/desert-island-discs/find-a-castaway/page/" . $i;
	print "# Getting Castaways from - " . $castaway_url . "\n";

	my $castaway_mech  = WWW::Mechanize->new();
	$castaway_mech->get( $castaway_url );
	my @links = $castaway_mech->links();

	# Now we get the files
	foreach my $link (@links) 
	{

		$first_link = $second_link;
		$second_link = $link->text();

		if ($link->url() =~ /castaway/) 
		{
			$second_link = $link->text();
		}

		if ($found_did == 1) 
		{
			$found_did = 0;
			if ($link->url() =~ /downloads.bbc.co.uk/) 
			{
				$castaway = lc($first_link);
				$castaway =~ s/\s/_/g;
				$castaway =~ s/'//g;

				$url = $link->url();
				$url =~ s/\s//g;
				@file = split(/_/, $url);
				@file = split(/-/, @file[1]);
				$filename = @file[0];	
		
				print "wget " . $url . " -O " . $filename . "_" . $castaway . ".mp3\n";

			}
		}

		if ($second_link eq $first_link) 
		{
			$found_did = 1;	
		}

	}

	# Play nice .... please!
	sleep(10);

}
