###########################################################################
# Shakespeare text parser
###########################################################################
# Eric M. Johnson
# July 12, 2003
#
# January 30, 2004: modified to use new database schema
#
# "Sections" = Acts
# "Chapters" = Scenes
###########################################################################
# begin timing the script
$begintime = time();
###########################################################################
# subroutine to add lines to database
###########################################################################
sub linewrite {
   $writepara = $_[0];
   $writeparanum = $_[1];
   $writeparatype = $_[2];
   $writeparasection = $_[3];
   $writeparachapter = $_[4];
  
   # identify the line type
   if ($writeparatype eq '$') { $writeparatype = 's' }   # stage directions
   if ($writeparatype eq '%') { $writeparatype = 'b' }   # blank verse -- parser can't tell difference between blank and metered verse
   if ($writeparatype eq '^') { $writeparatype = 'b' }   # blank verse -- parser can't tell difference between blank and metered verse
   # remove leading ASCII characters for stage directions, character lines, continued lines
   $writepara =~ s/[\$\%\^]//g;
   # figure out who the character is, remove his name from the line
   ($charid, $writepara, $speechcount) = charfinger($writepara, $writeparatype);
  
   # character count
   $charcount = length($writepara);
   # start by making everything lower case
   $bareline = lc($writepara);
   # strip out paragraph break string
   $bareline =~ s/\[p\]//g;
   # strip out newlines and replace with space
   $bareline =~ s/\n/ /g;
   # remove leading apostrophes
   # insert a marker, then remove the marker and the apostrophe
   $bareline =~ s/(\W')/\1APOSMARKER/g;
   $bareline =~ s/'APOSMARKER//g;
   # remove trailing apostrophes
   # insert a marker, then remove the marker and the apostrophe
   $bareline =~ s/('\W)/APOSMARKER\1/g;
   $bareline =~ s/APOSMARKER'//g;
   # replace emdashes with space
   $bareline =~ s/\-\-/ /g;
   # replace apostrophes with marker
   $bareline =~ s/'/APOSMARKER/g;
   # replace hyphens with marker
   $bareline =~ s/\-/HYPHENMARKER/g;
   # strip all non-alphanumeric characters
   $bareline =~ s/[^a-zA-Z\s]//g;
   # strip whitespace at the beginning of the line
   $bareline =~ s/^\s+//;
   # strip whitespace at the end of the line
   $bareline =~ s/[ ]*\n//;
   # strip multiple spaces
   $bareline =~ s/\s+/ /g;
   # split the line into words and count them
   @words = split(/ |\n/, $bareline);
   $wordcount = scalar(@words);
   # add to the work's wordcount
   $workwordcount = $workwordcount + $wordcount;
   # get the stems and metaphone values of each word on the line
   # first, clear the values, leaving a leading space for the stem and phonetic paragraph versions
   $stemgraph = ' ';
   $phonegraph = ' ';
   $currentword = 0;
   ###########################################################################
   # Begin processing word-by-word
   ###########################################################################
   foreach $word (@words) {
      # first, make sure we're not inserting a blank word
      if ($word ne '') {
         # increment the word count
         $currentword++;
         # remove apostrophe at beginning of word 
         $word =~ s/^APOSMARKER//g;
         # remove hyphen at end of word 
         $word =~ s/HYPHENMARKER$//g;
         # replace apostrophe and hyphen markers with real characters
         $word =~ s/APOSMARKER/'/g;
         $word =~ s/HYPHENMARKER/\-/g;
         # add the word to the wordforms hash
         $wordforms{$word}++;
         # get stem and metaphone values
         $bareword = $word;
         $bareword =~ s/[^a-z]//g; # strip unacceptable characters
        
         $stemword = Lingua::Stem::En::stem({-words => [$bareword]}) ;
         $metaphoneword = Metaphone($bareword);
         $stemgraph .= $stemword->[0] . " ";
         $phonegraph .= $metaphoneword . " ";
        
         # make sure all apostrophes will be acceptable for SQL
         $word =~ s/[']/''/g;
      }
   }
   # modify apostrophes to make it acceptable to SQL
   $writepara =~ s/\'/\'\'/g;
   # write a new line to the db
   $sqlstatement = "INSERT INTO Paragraphs (WorkID, CharID, PlainText, StemText, PhoneticText, ParagraphNum, ParagraphType, Section, Chapter, CharCount, WordCount) " .
                   "VALUES ('$currentwork', '$charid', '$writepara', '$stemgraph', '$phonegraph', $writeparanum, '$writeparatype', $writeparasection, $writeparachapter, $charcount, $wordcount)";
   if ($db->sql($sqlstatement)) {
      my(@err) = $db->Error;
      print "sql() ERROR\n";
      print "@err\n";
      die "\nDied while trying to write line $writeparanum\n$sqlstatement\n";
   }
   # increment the speech count and store it
   $speechcount++;
   $sqlstatement = "UPDATE Characters
                    SET SpeechCount=$speechcount
                    WHERE CharID = '$charid'";
   #print "$sqlstatement\n\n";
   if ($db->sql($sqlstatement)) {
      my(@err) = $db->Error;
      print "sql() ERROR\n";
      print "@err\n";
      die "\nDied while trying to update the speech count on line $writeparanum\n$sqlstatement\n";
   }
   $totalparagraphs++;
}
###########################################################################
# subroutine to figure out whose line it is, anyway
###########################################################################
sub charfinger {
   $tempcharline = $_[0];
   $tempcharparagraphtype = $_[1];
  
   if ($tempcharparagraphtype ne 's') {
      # get the chartemp value
      $pdloc = index($tempcharline, ".");
      $chartemp = substr($tempcharline, 0, $pdloc);
      $tempcharline = substr($tempcharline, $pdloc + 2);
      $charid = '';
     
      if ($chartemp eq 'xxx') {
         $charid = 'xxx';
      }
      else {
         # get character info from db
         $getcharinfo = "SELECT *
                         FROM Characters
                         WHERE Works
                         LIKE '%$currentwork%'
                           AND Abbrev='$chartemp'";
         if ($db->sql($getcharinfo)) {
            my(@err) = $db->Error;
            print "sql() ERROR\n";
            print "@err\n";
            die;
            }
            else
            {
            if ($db->FetchRow()) {
               my(%currentrow) = $db->DataHash();
               $charid = $currentrow{CharID};
               $charname = $currentrow{CharName};
               $abbrev = $currentrow{Abbrev};
               $speechcount = $currentrow{SpeechCount};
            }
            else
            {
            die "Character not found!  Died at $writeparanum\nchartemp:$chartemp\ncurrentline=$currentline\nlinecounter=$.";
            }
         }
      }
   }
   else
   {
      $charid = 'xxx'  # this is for stage direction lines
   }
   # tell it who it is, otherwise return an error
   if ($charid) {
      #print "[$textlinecount]CharID: $charid\n";
   }
   else
   {
      print "[$textlinecount]Character not identified\n";
      $noid++;
   }
   return $charid, $tempcharline, $speechcount;
}
###########################################################################
# subroutine to add new chapter
###########################################################################
sub addchapter {
   $newsection = $_[0];
   $newchapter = $_[1];
   $description = $_[2];
  
   # make apostrophes acceptable to SQL
   $description =~ s/\'/\&\#8217\;/g;
   # write new chapter to the db
   $sqlstatement = "INSERT INTO Chapters(WorkID, Section, Chapter, Description) " .
                   "VALUES ('$currentwork', $newsection, $newchapter, '$description')";
   #print "$sqlstatement\n\n";
   if ($db->sql($sqlstatement)) {
      my(@err) = $db->Error;
      print "sql() ERROR\n";
      print "@err\n";
      die "\nDied at Section $newsection, Chapter $newchapter. Check to see if stage directions are on the same line as the chapter indicator.";
   }
}
###########################################################################
# set up database connections
###########################################################################
use Win32::ODBC;
$db = new Win32::ODBC("oss");
###########################################################################
# open the language modules
###########################################################################
use Text::Metaphone;
use Lingua::Stem qw(stem);
###########################################################################
# delete all existing wordforms
###########################################################################
$sqlstatement = "DELETE From WordForms";
if ($db->sql($sqlstatement)) {
   my(@err) = $db->Error;
   print "sql() ERROR\n";
   print "@err\n";
   die "\nDied trying to delete all rows in the WordForm table";
}
###########################################################################
# variable population
###########################################################################
# populate all the Works if they are not specified on the command line
if (@ARGV) {
   @worklist = @ARGV;
}
else
{
   # get all works because no particular work was specified on the command line
   $getworks = "SELECT WorkID
                FROM Works
                ORDER BY Title";
   if ($db->sql($getworks)) {
      my(@err) = $db->Error;
      print "sql() ERROR\n";
      print "@err\n";
      die;
   }
   else
   {
      while ($db->FetchRow()) {
         my(%currentrow) = $db->DataHash();
         $worklist[$workcount] = $currentrow{WorkID};
         $workcount++;
      }
   }
   # remove the speech counts
   $sqlstatement = "UPDATE Characters
                    SET SpeechCount=0";
   #print "$sqlstatement\n\n";
   if ($db->sql($sqlstatement)) {
      my(@err) = $db->Error;
      print "sql() ERROR\n";
      print "@err\n";
      die "\nDied while trying to erase the speech counts.\n";
   }
}
# reset the workcount to zero
$totalworks = 0;
# start with Section 0, Chapter 1
$currentsection = 0;
$currentchapter = 0;
# flag for whether a line should be appended to a previous one
$appline = 0;
###########################################################################
# Main body of program
# Loop through each line, and parse according to what kind of line it is
###########################################################################
foreach $currentwork (@worklist) {
   # reset counter variables
   $noid = 0;
   $totalparagraphs = 0;
   $changelines = 0;
   $charlinecount = 0;
   $continuedlines = 0;
   $textlinecount = 1;
   $appline = 0;
   $workwordcount = 0;
   # get current work's title
   $getworkinfo = "SELECT Title
                   FROM Works
                   WHERE WorkID='$currentwork'";
   if ($db->sql($getworkinfo)) {
      my(@err) = $db->Error;
      print "sql() ERROR\n";
      print "@err\n";
      die "Could not get information about work $currentwork.";
      }
      else
      {
      while ($db->FetchRow()) {
         my(%workinfo) = $db->DataHash();
         $worktitle = $workinfo{'Title'};
      }
   }
   # start timing for this work
   $workbegintime = time();
   # delete old rows in Paragraphs table
   $sqlstatement = "DELETE * FROM Paragraphs WHERE WorkID='$currentwork'";
      print "\n------------------------------------------------\n";
      print uc($worktitle);
      print "\n------------------------------------------------\n";
      if ($db->sql($sqlstatement)) {
         my(@err) = $db->Error;
         print "sql() ERROR\n";
         print "@err\n";
         die
      }
      # delete old rows in Chapters for this play
      $sqlstatement = "DELETE * FROM Chapters WHERE WorkID='$currentwork'";
      if ($db->sql($sqlstatement)) {
         my(@err) = $db->Error;
         print "sql() ERROR\n";
         print "@err\n";
         die
      }
      $TEXTFILE = "\\oss\\texts\\parsing\\$currentwork.txt";
      open TEXTFILE or die "Can't open file $TEXTFILE\n";
      # line we're working on, if a character's line goes more than two lines
      $pendingline = '';
      $pendingparagraphnum = 0;
      foreach $currentline (
         $addline = 1;
         # get the first byte of the line, to determine what kind of line it is
         $linekind = substr($currentline, 0, 1);
         # stage direction lines
         if ($linekind eq '$') {
            $changelines++;
           
            # is this a chapter or act change?
            if (substr($currentline, 1, 7) eq "SECTION") {
               $currentsection = substr($currentline, 9, 1);
               # drop this line because it isn't needed
               $addline = 0;
            }
            if (substr($currentline, 1, 7) eq "CHAPTER") {
               # find where the period is, which is the indicator of where the scene number ends
               $periodpos = index $currentline, ".", 7;
               # figure out how many digits there are in the chapter
               $numsize = $periodpos - 9;
               $currentchapter = substr($currentline, 9, $numsize);
               # extract setting info, chomp the paragraph break
               $description = substr($currentline, 11+$numsize, length($currentline)-13);
              
               # add the chapter to the db
               addchapter($currentsection, $currentchapter, $description);
               # drop this line because it isn't needed
               $addline = 0;
            }
            if ($addline eq 1) {
               # write current line to database unless this is a section or chapter indication line
               if ($appline ne 0) {
                  linewrite($currentline, $textlinecount, $linekind, $currentsection, $currentchapter);
               }
               else
               {
                  # write pending line to database
                  linewrite($pendingline, $pendingparagraphnum, $pendinglinekind, $pendingsection, $pendingchapter);
                  # clear pending line
                  $pendingline = '';
                  $pendingparagraphnum = 0;
                  $pendinglinekind = '';
                  $pendingsection = 0;
                  $pendingchapter = 0;
                 
                  # write new line to database
                  linewrite($currentline, $textlinecount, $linekind, $currentsection, $currentchapter);
                  }
               $appline = 0;
            }
         }
         # Beginning of character lines
         if ($linekind eq '%') {
            $charlinecount++;
            if ($appline ne 0) {
               #write pending line to database
               linewrite($pendingline, $pendingparagraphnum, $pendinglinekind, $pendingsection, $pendingchapter);
               #clear old line
               $pendingline = '';
               $pendingparagraphnum = 0;
               $pendinglinekind = '';
               $pendingsection = 0;
               $pendingchapter = 0;
            }
            # populate the pending line data with the current line
            $pendingline = $currentline;
            $pendingparagraphnum = $textlinecount;
            $pendinglinekind = $linekind;
            $pendingsection = $currentsection;
            $pendingchapter = $currentchapter;
            $appline = 1;
         }
         if ($linekind eq '^') {
            $continuedlines++;
            $pendingline = "$pendingline\[p\]$currentline";
         }
	 # add the addline variable, which says whether we should increment the line count
	 $textlinecount = $textlinecount + $addline;
   }
   # write last pending line if it's still there
   if ($pendingline) {
      #write pending line to database
      linewrite($pendingline, $pendingparagraphnum, $pendinglinekind, $pendingsection, $pendingchapter);
      $textlinecount++;
   }
   # Show report data
   print "Total lines processed: " . ($textlinecount + $changelines) . "\n";
   print "   Chapter/scene change lines: $changelines\n";
   #print "   Character lines paragraphs: $charlinecount\n";
   #print "   Continued paragraphs: $continuedlines\n";
   $subtotal = $changelines + $charlinecount + $continuedlines;
   #print "Subtotal: $subtotal\n";
   # show total words, paragraphs
   print "Total words: $workwordcount\n";
   print "Total paragraphs: $totalparagraphs\n";
   # update the database with total words and total paragraphs
   $sqlstatement = "UPDATE Works
                    SET TotalWords=$workwordcount,
                        TotalParagraphs=$totalparagraphs
                    WHERE WorkID = '$currentwork'";
   #print "$sqlstatement\n\n";
   if ($db->sql($sqlstatement)) {
      my(@err) = $db->Error;
      print "sql() ERROR\n";
      print "@err\n";
      die "\nDied while trying to update the word and paragraph totals on line $writeparanum\n$sqlstatement\n";
   }
  
   # close the file that was just parsed
   close TEXTFILE;
  
   # increment the works counter
   $totalworks++;
   # end timing for this work
   $workendtime = time();
   $workexectime = $workendtime - $workbegintime;
   $minutes = int($workexectime / 60);
   $seconds = sprintf("%02d", $workexectime - ($minutes * 60));
   print "Execution time for this work $minutes:$seconds\n";
   # show cumulative timing thus far
   $cumulativetime = time() - $begintime;
   $minutes = int($cumulativetime / 60);
   $seconds = sprintf("%02d", $cumulativetime - ($minutes * 60));
   print "Cumulative execution time $minutes:$seconds\n";
}
# show the word forms, add them to db
foreach $word (sort by_count keys %wordforms) {
   #print "$word occurs $wordforms{$word} times\n";
      # start by stripping unacceptable characters
      $bareword = $word;
      $bareword =~ s/[^a-z]//g; 
      # determine the stem and phonetic value of the word
      $stemword = Lingua::Stem::En::stem({-words => [$bareword]}) ;
      $metaphoneword = Metaphone($bareword);
      # count occurences
      $occurences = $wordforms{$word};
      # make sure all apostrophes will be acceptable for SQL
      $word =~ s/[']/''/g;
      $stemword[0] =~ s/[']/''/g;
      # create a new entry in the WordForms table
      $addwordquery = "
      INSERT INTO WordForms (PlainText, PhoneticText, StemText, Occurences)
      VALUES ('$word', '$metaphoneword', '$stemword->[0]', $occurences)";
      if ($db->sql($addwordquery)) {
         my(@err) = $db->Error;
         print "sql() ERROR\n";
         print "@err\n";
         print "currentword = $currentword\n$bareline\naddwordquery=$addwordquery";
         die;
      }
}
sub by_count {
        $wordforms{$b} <=> $wordforms{$a};
}
###########################################################################
# Housecleaning
###########################################################################
# close the database connection
$db->Close();
# get the ending time and display execution time
$endtime = time();
$exectime = $endtime - $begintime;
$minutes = int($exectime / 60);
$seconds = $exectime - ($minutes * 60);
print "\n////////////////////////////////////////////////\n";
print "Works processed: $totalworks\n";
$minutes = int($exectime / 60);
$seconds = sprintf("%02d", $exectime - ($minutes * 60));
print "Total processing time $minutes:$seconds\n";
$avgtime = ($exectime / $totalworks);
$minutes = int($avgtime / 60);
$seconds = sprintf("%02d", $avgtime - ($minutes * 60));
print "Average time per work $minutes:$seconds\n"