#! /usr/bin/perl -w
use strict;
# Hack to concatenate all of the ITS RDF Subgroup minutes into one big page,
# to assist in finding things. Runs on Linux.
# NOTE: The last attempted minutes download will result in an 'ERROR 404:
# Not Found' message. This is normal.
#
# Run as:
# ./hl7-its-fhir-rdf-minutes.perl > /tmp/all-minutes.html
#
# Author: David Booth
# Date: 9/14/2015
# License: Apache 2.0
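#
# Requires: wget on the PATH, and a URL-extraction helper script
# (invoked below as /home/dbooth/bin/grepurls via &Pipe).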
my $startYear = 2014;
my $yearListUrl = "http://wiki.hl7.org/index.php?title=Category:ITS_RDF_Minutes_2015";
my $reverse = 1; # Reverse chronological order?
my $tmp = "/tmp/its-rdf-minutes-tmp.html";
my $isFirstPage = 1;
my $year = $startYear;
my @bodies = ();
my $beforeBodies = "";
my $afterBodies = "";
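# Walk the per-year minutes category pages, starting at $startYear,
# collecting the <body> content of each minutes page into @bodies.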
while (1) {
die if $year > 2035;
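# Build this year's category-page URL and fetch the list of minutes pages.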
my $yurl = $yearListUrl;
$yurl =~ s/2015/$year/;
my $yresponse = `wget -k -O $tmp '$yurl' ; cat $tmp`;
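# Stop when we reach a year that has no minutes category page yet.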
last if $yresponse =~ m|There is currently no text in this page|i;
last if !$yresponse;
$yresponse =~ m|Sub category of| || die "Unknown response from $yurl\n ";
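# Extract all URLs from the category page, then keep only the links
# to individual concall minutes pages.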
my @urls = split(/\n/, &Pipe($yresponse, '/home/dbooth/bin/grepurls'));
@urls = grep {m/^http(s?)\:/} @urls;
@urls = grep {m/ITS_RDF_Concall_Minutes/} @urls;
# print "URLS: @urls\n";
# exit 0;
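# Fetch each minutes page and accumulate its <body> content.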
foreach my $url (@urls) {
my $body = "";
warn "Getting $url ...\n";
my $page = `wget -k -O $tmp '$url' ; cat $tmp`;
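# Sanity check: make sure the fetched page looks like ITS RDF minutes.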
$page =~ m|ITS RDF .* Minutes|i || die "Unknown response from $url : \n$page\n ";
# Most wiki minutes pages are stubs that point to
# W3C minutes pages.
# Meeting log: http://www.w3.org/2014/11/04-hcls-minutes.html
if ($page =~ m/Meeting log:\s*\<a href\="(([^"]+))"/ims) {
$url = $2;
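# Fetch the linked W3C minutes page instead, and make sure it
# has an attendee list.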
$page = `wget -k -O $tmp '$url' ; cat $tmp`;
$page =~ m|Attendees|i || die "Unknown response from $url : \n$page\n ";
}
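# Split the page at the opening <body> tag: $& is everything up to
# and including the tag, $' is the rest of the page.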
$page =~ m/\A.*\<body[^\>]*\>/ims || die;
my $beforeBody = $&;
my $afterBody = $';
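# Keep the HTML that precedes the body content only once, from the
# first page fetched.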
if ($isFirstPage) {
# print "$beforeBody\n";
$beforeBodies = "$beforeBody\n";
$isFirstPage = 0;
}
$page = $afterBody;
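# Drop everything from the closing </body> tag onward.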
$page =~ s/\<\/body\>.*\Z//ims || die;
# print "\n";
$body .= "\n";
# print "<hr />\n<p>\nMinutes downloaded from: $url</p>\n";
$body .= "<hr />\n<p>