DELETE THIS PREFIX BEFORE RUNNING!
#! /usr/bin/perl -w
use strict;

# Hack to concatenate all of the ITS RDF Subgroup minutes into one big page,
# to assist in finding things.  Runs on Linux.
# NOTE: The last attempted minutes download will result in an 'ERROR 404:
# Not Found' message.  This is normal.
#
# Run as:
#   ./hl7-its-fhir-rdf-minutes.perl > /tmp/all-minutes.html
#
# Author: David Booth
# Date: 9/14/2015
# License: Apache 2.0

my $startYear = 2014;
my $yearListUrl = "http://wiki.hl7.org/index.php?title=Category:ITS_RDF_Minutes_2015";
my $reverse = 1;                # Reverse chronological order?
my $tmp = "/tmp/its-rdf-minutes-tmp.html";

my $isFirstPage = 1;
my $year = $startYear;
my @bodies = ();
my $beforeBodies = "";
my $afterBodies = "";
# Walk the per-year category pages until a year's page does not exist.
while (1) {
    die if $year > 2035;
    my $yurl = $yearListUrl;
    $yurl =~ s/2015/$year/;
    my $yresponse = `wget -k -O $tmp '$yurl' ; cat $tmp`;
    last if $yresponse =~ m|There is currently no text in this page|i;
    last if !$yresponse;
    $yresponse =~ m|Sub category of| || die "Unknown response from $yurl\n";
    # Pull out the URLs of the individual minutes pages for this year.
    my @urls = split(/\n/, &Pipe($yresponse, '/home/dbooth/bin/grepurls'));
    @urls = grep {m/^http(s?)\:/} @urls;
    @urls = grep {m/ITS_RDF_Concall_Minutes/} @urls;
    # print "URLS: @urls\n";
    # exit 0;
    foreach my $url (@urls) {
        my $body = "";
        warn "Getting $url ...\n";
        my $page = `wget -k -O $tmp '$url' ; cat $tmp`;
        $page =~ m|ITS RDF .* Minutes|i || die "Unknown response from $url :\n$page\n";
        # Most wiki minutes pages are stubs that point to
        # W3C minutes pages, e.g.:
        # Meeting log: http://www.w3.org/2014/11/04-hcls-minutes.html
        if ($page =~ m/Meeting log:\s*<a\s+href="(([^">]+))"/ims) {
            $url = $2;
            $page = `wget -k -O $tmp '$url' ; cat $tmp`;
            $page =~ m|Attendees|i || die "Unknown response from $url :\n$page\n";
        }
        # Split the page at the opening <body> tag: $& is everything up to
        # and including the tag; $' is everything after it.
        $page =~ m/\A.*\<body.*?\>/ims || die;
        my $beforeBody = $&;
        my $afterBody = $';
        if ($isFirstPage) {
            # print "$beforeBody\n";
            $beforeBodies = "$beforeBody\n";
        }
        $page = $afterBody;
        $page =~ s/\<\/body\>.*\Z//ims || die;
        # print "<hr />\n";
        $body .= "<hr />\n";
        # print "<hr />\n<hr />\nMinutes downloaded from: <h2><a href=\"$url\">$url</a></h2>\n";
        $body .= "<hr />\n<hr />\nMinutes downloaded from: <h2><a href=\"$url\">$url</a></h2>\n";
        # print "$page\n";
        $body .= "$page\n";
        push(@bodies, $body);
        $isFirstPage = 0;
    }
    $year++;
}
if (!$isFirstPage) {
    # print "<hr />\n</body>\n</html>\n";
    $afterBodies = "<hr />\n</body>\n</html>\n";
}
if ($reverse) {
    @bodies = reverse @bodies;
}
my $allBodies = join("", @bodies);
print $beforeBodies;
print $allBodies;
print $afterBodies;
# unlink($tmp);
exit 0;

################ Pipe ###############
sub Pipe
# Pipe a string (as stdin) through a command, returning the stdout of the
# command either as a list or as a string.
{
    @_ == 2 || die;
    my ($s, $cmd) = @_;
    my $tmp = "/tmp/pipe.$$.tmp";
    open(my $fh, ">$tmp") || die "ERROR: Cannot open for write: $tmp\n";
    print $fh $s;
    close($fh);
    my $fullcmd = "cat $tmp | $cmd";
    if (wantarray) {
        my @result = `$fullcmd`;
        unlink($tmp);
        return(@result);
    }
    else {
        my $result = `$fullcmd`;
        unlink($tmp);
        return($result);
    }
}
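
# Usage sketch for the Pipe helper (illustrative only; 'sort' is just a
# stand-in for an external filter such as the grepurls script used above):
#   my $sorted = &Pipe("b\na\nc\n", "sort");   # scalar context: "a\nb\nc\n"
#   my @lines  = &Pipe("b\na\nc\n", "sort");   # list context: ("a\n", "b\n", "c\n")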