#!/usr/bin/perl
# Vyakarana/Dhatupatha/mdhvindx/disp/mdhv-main.pl Mar 8, 2009
# read the entries of MDhVIndxShastri.xml.
# if gaRaNum is not 1-10, skip
#use lib qw(../../util/);
#use sandhi::sandhi;
#use Alphabetize;
# Load the project's sandhi and alphabetizing helpers from their fixed
# install location at compile time, and import the functions this script
# calls by bare name.
# The import lists are wrapped in explicit parentheses: the older
# "->import qw(...)" spelling depended on qw() being treated as a
# parenthesized list, which perl 5.18 and later reject as a syntax error.
BEGIN {
    require "/Volumes/Sanskrit-Raid/jfunderburk/util/sandhi/sandhi.pm";
    sandhi->import(qw(sandhi sandhioptions));
}
BEGIN {
    require "/Volumes/Sanskrit-Raid/jfunderburk/util/Alphabetize.pm";
    Alphabetize->import(qw(Sanskrit_order Sanskrit_order_num));
}
use Getopt::Std;
# Command line switches: "-o LETTERS" supplies sandhi option letters and
# "-n" is accepted.  Nothing in this file reads $options{n}.
%options = ();
getopts("o:n", \%options);

# Arguments for sandhioptions(), in call order.  The first three have
# defaults; the fourth stays undefined unless -o provides it.
my ($compound_ans, $vedic_ans, $close_ans, $despace_ans) = ("C", "N", "S");

# -o replaces the defaults only when it holds at least three characters.
# Characters 1-3 become the first three arguments; character 4, when
# present, becomes the fourth (otherwise it is undefined).  A shorter
# value is ignored without any message.
my $args = defined $options{o} ? $options{o} : undef;
if ($args and length($args) >= 3) {
    ($compound_ans, $vedic_ans, $close_ans, $despace_ans) =
        (split('', $args))[0 .. 3];
}

# Anything other than the string "0" coming back is treated as fatal.
my $err = sandhioptions($compound_ans, $vedic_ans, $close_ans, $despace_ans);
die "sandhioptions returns err = $err\n" if $err ne 0;
##Step1: -- construct @outans by reading lines from the files named on the
## command line, or from STDIN when none are given.
my $n = 0;      # number of lines read
my $nout = 0;   # number of records stored in @outans
my $nentry = 0; # number of entries handed to process_line
my $nerr = 0;   # number of entries process_line returned "" for
# gaRa names indexed by gaRaNum.  Slot 0 is only a placeholder: an entry
# must carry a gaRaNum of 1 or more that has a name here to be accepted.
my @gaRa_abbrev = qw(skip BvAdi adAdi juhotyAdi divAdi svAdi tudAdi ruDAdi tanAdi kryAdi curAdi kaRqvAdi sOtraDAtu nAmaDAtu Rijanta );
# One "sort : disp" string per accepted entry (see process_line).
my @outans;
while (my $line = <>) {
    $n++;
    chomp($line);
    # Only lines holding an <entry> with a <gaRaNum> element are of interest.
    next unless $line =~ /^ *<entry.*<gaRaNum.*?>(.*?)<\/gaRaNum>/;
    my $gaRaNum = $1;

    # Use gaRaNum as an index only when it is a plain positive integer.
    # Previously non-numeric or "0" text selected the "skip" placeholder
    # (a true value, so the entry was processed anyway) and negative text
    # indexed @gaRa_abbrev from its end.
    my $gaRa_name;
    if ($gaRaNum =~ /^\s*\d+\s*$/ and $gaRaNum >= 1) {
        $gaRa_name = $gaRa_abbrev[$gaRaNum];
    }
    next unless $gaRa_name;

    $nentry++;
    my $ans = process_line($line, $gaRaNum, $gaRa_name);
    if ($ans ne "") {
        push @outans, $ans;
        $nout++;
    }
    else {
        print STDERR "entry not processed: $line\n";
        $nerr++;
    }
}
print STDERR "$n lines read\n";
print STDERR "$nout lines written\n";
print STDERR "$nentry entries accepted\n";
print STDERR "$nerr entries not processed\n";
##Step2: -- sort @outans
# Sanskrit_order_num is the comparator imported from Alphabetize.
my @sort_outans = sort Sanskrit_order_num @outans;
##Step3: -- print @sort_outans
foreach my $ans (@sort_outans) {
    # $ans holds two substrings separated by ' : ', in the order
    #   sort, disp.
    # For printing, the order is changed to disp, sort.
    # The limit of 2 splits at the first colon only, so a colon inside the
    # display text no longer causes everything after it to be dropped.
    my ($sort, $disp) = split(/ *: */, $ans, 2);
    my $ansout = join(' : ', ($disp, $sort));
    print "$ansout\n";
}
exit;
# process_line($s, $gaRaNum, $gaRa_name)
#
# Convert one <entry> line into the string "sort : disp".
#   $s          the entry line
#   $gaRaNum    text of the entry's <gaRaNum> element
#   $gaRa_name  gaRa name the caller selected for $gaRaNum
# Returns "" when the line has no (non-empty) <fullDAtu> element.
#
# sort = "<fullDAtu without leading (..)/[..] prefix>0 <senses> <NN> <pa>"
#        where <senses> has all (..) and [..] groups removed and NN is
#        gaRaNum formatted with two digits.
# disp = nine fields joined by " ; ":
#        fullDAtu ; root ; marker ; senses ; gaRaNum ; gaRa_name ;
#        sUtraNum ; vfttiPages ; pa
# <pa> is "[pA]" when the entry has a non-empty <pAWAntara>, else "".
#
# Side effects: calls sandhioptions() twice on every invocation, and may
# write "chk:" diagnostics to STDERR.
# NOTE(review): if sandhioptions() sets persistent state in sandhi.pm, these
# calls replace whatever the script configured at startup -- confirm intent.
sub process_line {
    my ($s,$gaRaNum,$gaRa_name) = @_;

    # Set to 1 to trace how sandhi changed the senses of entries that
    # contain variant/inherited/inferred senses.
    my $trace_sandhi = 0;

    # Without a fullDAtu there is nothing to index.
    my $fullDAtu;
    if ($s =~ /<fullDAtu>(.*?)<\/fullDAtu>/) {
        $fullDAtu = $1;
        if ($s =~ /<preverb>/) {
            # Entries with a preverb show the first "(" / ")" as "[" / "]".
            $fullDAtu =~ s/\(/[/;
            $fullDAtu =~ s/\)/]/;
        }
    }
    return "" if not $fullDAtu;

    # Optional elements default to "" so they can be interpolated safely.
    # (The previous check tested the bareword "sUtraNum", which is always
    # true, so the intended default for $sUtraNum was never applied.)
    my $senseElt  = $s =~ /<sense.*?>(.*?)<\/sense>/         ? $1 : "";
    my $sUtraNum  = $s =~ /<sUtraNum>(.*?)<\/sUtraNum>/      ? $1 : "";
    my $root      = $s =~ /<root.*?>(.*?)<\/root>/           ? $1 : "";
    my $pAWAntara = $s =~ /<pAWAntara>(.*?)<\/pAWAntara>/    ? $1 : "";
    my $page      = $s =~ /<vfttiPages>(.*?)<\/vfttiPages>/  ? $1 : "";
    # Only the first marker is used; further markers are ignored.
    my $marker    = $s =~ /<marker.*?>(.*?)<\/marker>/       ? $1 : "";

    my $pa = $pAWAntara ? "[pA]" : "";

    # Walk the child elements of <sense> left to right.  The /g match
    # resumes after the previous element, replacing the old $& / $' idiom.
    my $sensedata = "";
    my $variant = 0;
    while ($senseElt =~ /<(.*?)>(.*?)<\/.*?>/g) {
        my ($elt, $text) = ($1, $2);   # copy before other matches clobber them
        # Mar 12, 2009: senseterms with special attributes (suKa, duHKa)
        # are no longer ignored.
        $text =~ s/=/-/g; # sub-compound separator => compound separator
        if ($elt =~ /variant="yes"/) {
            # Use {} here and change to () after sandhi: when () was used
            # at this point the sandhi program produced an error.
            $text = '{' . $text . '}';
            $variant = 1;
        }
        elsif ($elt =~ /inherited="yes"/) {
            $text = '[' . $text . ']';
            $variant = 1;
        }
        # The retired test for inferred="yes" had been "commented out" with
        # C-style //, which Perl parsed as a live, empty elsif branch that
        # could pre-empt this one.  It is removed here.
        elsif ($elt =~ /type="inferred"/) {
            $text = '[' . $text . ']';
            print STDERR "chk: $fullDAtu ; $text\n";
            $variant = 1;
        }
        if ($sensedata ne "") {
            $text = " " . $text; # separate elements with space
        }
        $sensedata .= $text;
    }
    # In case $sensedata is still empty, use "specifier", if present.
    # (MDhVIndx has at most 1 specifier element.)
    if ($sensedata eq "") {
        if ($s =~ /<specifier>(.*?)<\/specifier>/) {
            $sensedata = $1;
        }
    }

    # Sort form of the root:
    # a. remove preverb, prefix, space: '(xyz) aki' => 'aki'
    # b. add '0'
    my $fullDAtu_sort = $fullDAtu;
    $fullDAtu_sort =~ s/^ *\(.*?\) *//;
    $fullDAtu_sort =~ s/^ *\[.*?\] *//;
    $fullDAtu_sort .= "0";

    # Apply sandhi to the senses in two passes with different options
    # (per the original notes: compound sandhi, then external sandhi).
    sandhioptions("C","N","S");
    my $sensedata1 = sandhi($sensedata);
    sandhioptions("E","N","S","Y");
    my $sensedata2 = sandhi($sensedata1);
    if ($trace_sandhi and ($variant eq 1) and ($sensedata ne $sensedata2)) {
        my $step1 = ($sensedata  ne $sensedata1) ? "=>" : "==";
        my $step2 = ($sensedata1 ne $sensedata2) ? "=>" : "==";
        print STDERR "chk: $sensedata $step1 $sensedata1 $step2 $sensedata2\n";
    }

    # Display form: revert the {} placeholders to ().
    my $sensedata_disp = $sensedata2;
    $sensedata_disp =~ tr/{}/()/;

    # Sort form: drop the () and [] groups altogether.
    my $sensedata_sort = $sensedata_disp;
    $sensedata_sort =~ s/\(.*?\)//g;
    $sensedata_sort =~ s/\[.*?\]//g;

    # For sorting, make sure gaRaNum has 2 digits, with leading 0.
    my $gaRaNum_sort = sprintf('%02d',$gaRaNum);

    my $sort = join(" ", $fullDAtu_sort, $sensedata_sort, $gaRaNum_sort, $pa);
    my $disp = join(" ; ",
        $fullDAtu, $root, $marker, $sensedata_disp,
        $gaRaNum, $gaRa_name, $sUtraNum, $page, $pa);
    return $sort . " : " . $disp;
}
