#!/usr/bin/perl
# jelparse.pl cfb 2A07
use IO::File;
use LWP::Simple;
use XML::Writer;

# Download the JEL classification page into $_; every substitution below
# operates on $_.  LWP::Simple::get returns undef when the request fails,
# so stop here instead of silently producing an empty schema.
my $url = "http://www.aeaweb.org/journal/elclasjn.html";
$_ = get($url);
defined $_ or die "jelparse: could not fetch $url\n";

# Drop carriage returns so that only "\n" line breaks remain.
s/\r//g;
# Multi-line category names: a line break followed by spaces and then a
# character that is neither a space nor "<" is a continuation of the
# previous line, so remove the break together with the indentation.
s/\n +([^ <])/$1/g;
# Strip the leading spaces from the lines that remain indented.
s/\n +/\n/g;
# Remove <img ...> tags.
s/<img [^>]+>//g;

# Hand-written repairs for specific malformed entries in the page
# (four substitutions, each applied at most once).
#
# The B51 entry is preceded by nested <font>/<b> tags; replace them with
# <dd> so that the entry starts with "<dd><b>" like the pattern used by the
# extraction loop further down.
s|<font size=-1><b><font size=-1>(<b>B51 - Socialist; Marxian; Sraffian)|<dd>$1|;
# The next three substitutions each fold a description fragment that the
# page puts in its own <dd><b>...</b></dd> element back onto the text that
# precedes it, dropping the closing/opening tags and the line break.
# NOTE(review): this first pattern also consumes the space before </b>, and
# the captured text has no leading space -- confirm the joined words are
# still separated in the actual page.
s| </b> *\n<dd><b>(and General Outlook)</b></dd>|$1|;
s|</b></dd>\n<dd><b>( Elections, Legislatures, and Voting Behavior)</b></dd>|$1|;
s|</b></dd>\n<dd><b>( Efficiency Wage Models, and Internal Labor Markets)</b></dd>|$1|;

#
# Sometimes a <br> directly followed by a capital letter is the start of a
# category.  Regularise that: give every such entry its own line starting
# with "<dd><b>".
s/<br>([A-Z])/\n<dd><b>$1/g;
# Headers for the top-level categories: strip the opening tag of the
# econlit.org link that wraps them.
s!<a href="http://www.econlit.org/elsub\w.html">!!g;
# Decode escaped ampersands.  The /g flag is required: without it only the
# first "&amp;" in the page is decoded and later descriptions keep the
# literal entity text.
s/&amp;/&/g;
# One category entry: "<dt>" or "<dd>", then "<b>", then the code (capital
# letters optionally followed by digits), optional dashes, and the
# description text.
my $entry_re = qr{<d[td]> *<b> *([A-Z]+\d*) +-* ([\dA-Z:;\.\-,a-z ()&'/]+)};

# Consume the entries from $_ one at a time (each successful substitution
# deletes the leftmost remaining entry) and record code => description in
# %jel.  A code seen more than once keeps the description found last.
while (s/$entry_re//) {
    my ($code, $label) = ($1, $2);
    $label =~ s/^[ \-]+//;    # leading spaces and dashes
    $label =~ s/ +$//;        # trailing spaces
    $jel{$code} = $label;
}



# Write the entries of %jel to jel1991.xsd (in the current directory) as an
# XML Schema defining two simple types:
#   jelElement - xs:string restricted to an enumeration of the keys of %jel,
#                each value annotated with its description;
#   jel1991    - a list whose items are of type jelElement.
my $xsd_file = 'jel1991.xsd';
my $out = IO::File->new($xsd_file, '>')
    or die "jelparse: cannot open $xsd_file for writing: $!\n";
my $x = XML::Writer->new(OUTPUT => $out, DATA_MODE => 1, DATA_INDENT => 1);
$x->xmlDecl();
$x->startTag('xs:schema',
             'xmlns:xs'             => "http://www.w3.org/2001/XMLSchema",
             'elementFormDefault'   => "qualified",
             'attributeFormDefault' => "unqualified",
             'xmlns'                => "http://amf.openlib.org",
             'targetNamespace'      => "http://amf.openlib.org");

# The enumeration type: one xs:enumeration per category code, in sorted
# order.  Tag names are passed to endTag so XML::Writer verifies nesting.
$x->startTag('xs:simpleType', 'name' => "jelElement");
$x->startTag('xs:restriction', 'base' => "xs:string");

for my $code (sort keys %jel) {
    $x->startTag('xs:enumeration', 'value' => $code);
    $x->startTag('xs:annotation');
    $x->startTag('xs:documentation');
    $x->characters($jel{$code});
    $x->endTag('xs:documentation');
    $x->endTag('xs:annotation');
    $x->endTag('xs:enumeration');
}
$x->endTag('xs:restriction');
$x->endTag('xs:simpleType');

# Create the list type.
$x->startTag('xs:simpleType', 'name' => "jel1991");
$x->emptyTag('xs:list', 'itemType' => "jelElement");
$x->endTag('xs:simpleType');
$x->endTag('xs:schema');

# Finish the document (XML::Writer checks that every tag was closed), then
# close the file; buffered write errors only surface at close time.
$x->end();
$out->close
    or die "jelparse: error closing $xsd_file: $!\n";

exit;
