From d29a3e5469d8468084641c385ebf16948f7c2437 Mon Sep 17 00:00:00 2001
From: Ralph Amissah <ralph@amissah.com>
Date: Tue, 22 Jul 2008 20:00:59 -0400
Subject: sisu-0.68.0 proposed * middle layer document representation changed,
 (accounting for substantial patch) * texpdf multiple document sizes as
 specified in config * numerous small fixes [should on the whole be easier to
 maintain]

---
 lib/sisu/v0/concordance.rb | 49 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 13 deletions(-)

(limited to 'lib/sisu/v0/concordance.rb')
diff --git a/lib/sisu/v0/concordance.rb b/lib/sisu/v0/concordance.rb
index 1b777bb5..f62b20ac 100644
--- a/lib/sisu/v0/concordance.rb
+++ b/lib/sisu/v0/concordance.rb
@@ -183,18 +183,18 @@ WOK
           @path="#{@env.path.output}/#{@md.fnb}"
           @freq=Hash.new(0)
           @dp=@@dp ||=SiSU_Env::Info_env.new.digest.pattern
-          @rxp_to=Regexp.new("<~(\\d+);(?:[oh]|[0-6]:)\\d+;\\w\\d+><#@dp:#@dp>$")
-          @rxp_lv1=Regexp.new('^1~') #line start markers removed, ('^1~') for exceptions <!pn!>\n\n4{{{
-          @rxp_lv2=Regexp.new('^2~')
-          @rxp_lv3=Regexp.new('^3~')
-          @rxp_seg=Regexp.new('^4~(.+?)\s+')
-          @rxp_title=Regexp.new('^0~title\s*(.+?)\s*$')
+          @rxp_to=Regexp.new("#{Mx[:id_o]}~(\\d+);(?:[oh]|[0-6]:)\\d+;\\w\\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}|#{Mx[:id_o]}\S+?#{Mx[:id_c]}$")
+          @rxp_lv1=/^#{Mx[:lv_o]}1:/
+          @rxp_lv2=/^#{Mx[:lv_o]}2:/
+          @rxp_lv3=/^#{Mx[:lv_o]}3:/
+          @rxp_seg=/^#{Mx[:lv_o]}4:(\S+?)#{Mx[:lv_c]}/
+          @rxp_title=Regexp.new("^#{Mx[:meta_o]}title#{Mx[:meta_c]}\s*(.+?)\s*$")
           @rxp_t1=Regexp.new('^T1')
           @rxp_t2=Regexp.new('^T2')
           @rxp_t3=Regexp.new('^T3')
-          @rxp_excluded1=/(?:https?|file|ftp):\/\/\S+/mi
-          @rxp_excluded0=/^(?:to\d+|\d+|&nbsp;|EOF|thumb_\S+|snap_\S+|_+|-+|ii+|iv|vi+|ix|xi+|xiv|xv|xvi+|xix|xx|\S+?_\S+|[\d_]+\w\S+|[\w\d]{1,2}|\d{1,3}\w?|#@dp|[0-9a-f]{16,64}|\d{2,3}x\d{2,3}|\S{0,2}sha\d|\S{0,3}\d{4}w\d\d|\b\w\d+|\d_all\b|e\.?g\.?)$/mi #this regex causes and cures a stack dump in ruby 1.9 !!!
-          @rgx_scanlist=%r{(?:<i>(?:[a-zA-Z0-9"\s]){2,7}</i>|<b>(?:[a-zA-Z0-9"\s]){2,7}</b>|(?:https?|file)://\S+)|code\{.+?\}code|<\S+?>|\w+}mi
+          @rxp_excluded1=/(?:https?|file|ftp):\/\/\S+/
+          @rxp_excluded0=/^(?:#{Mx[:fa_bold_o]}|#{Mx[:fa_italics_o]})?(?:to\d+|\d+|&nbsp;|#{Mx[:br_endnotes]}|EOF|#{Mx[:br_eof]}|thumb_\S+|snap_\S+|_+|-+|[(]?(?:ii+|iv|vi+|ix|xi+|xiv|xv|xvi+|xix|xx)[).]?|\S+?_\S+|[\d_]+\w\S+|[\w\d]{1,2}|\d{1,3}\w?|#@dp|[0-9a-f]{16,64}|\d{2,3}x\d{2,3}|\S{0,2}sha\d|\S{0,3}\d{4}w\d\d|\b\w\d+|\d_all\b|e\.?g\.?)(?:#{Mx[:fa_bold_c]}|#{Mx[:fa_italics_c]})?$/mi #this regex causes and cures a stack dump in ruby 1.9 !!!
+          @rgx_scanlist=%r{#{Mx[:fa_italics_o]}[a-zA-Z0-9"\s]{2,12}#{Mx[:fa_italics_c]}|#{Mx[:fa_bold_o]}[a-zA-Z0-9"\s]{2,12}#{Mx[:fa_bold_c]}|(?:https?|file)://\S+|#{Mx[:gr_o]}code#{Mx[:gr_o]}.+?#{Mx[:gr_o]}code-end#{Mx[:gr_o]}|<\S+?>|#{Mx[:id_o]}\S+?#{Mx[:id_c]}|\w+|[a-zA-Z]+}mi
         rescue; SiSU_Errors::Info_error.new($!,$@,@md.cmd,@md.fns).error
         end
       end
@@ -231,20 +231,39 @@ WOK
         @seg,toy=nil,nil
         @word_map={}
         @dal_array.each do |line|
-          if line !~/<~(\d+);[um]\d+;\w\d+><#@dp:#@dp>$/ # lines to ignore: # are added but not part of authors substantive text; 0 are mostly machine generated
-            if line =~@rxp_seg;   @seg=line[@rxp_seg,1]
+          if line !~/#{Mx[:id_o]}~(\d+);[um]\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}$/ #lines to ignore: # are added but not part of authors substantive text; 0 are mostly machine generated
+            if line =~@rxp_seg; @seg=line[@rxp_seg,1]
             end
-            if line =~@rxp_to;    toy=line[@rxp_to,1]
+            if line =~@rxp_to;  toy=line[@rxp_to,1]
             end
             if toy =~/\d+/ \
             and toy !~/^0$/
               for word in line.scan(@rgx_scanlist) #%take in word or other match
+                #word.gsub!(@rxp_clean,'')
+                word.gsub!(/#{Mx[:fa_o]}\S+?#{Mx[:fa_o_c]}/,'')
+                word.gsub!(/#{Mx[:fa_c_o]}\S+?#{Mx[:fa_c]}/,'')
+                word.gsub!(/#{Mx[:gl_o]}#[a-z]+#{Mx[:gl_c]}/,'')
+                word.gsub!(/#{Mx[:gl_o]}#[0-9]+#{Mx[:gl_c]}/,'')
+                word.gsub!(/^\S$/,'')
+                word=nil if word.empty?
                 word=nil if word =~@rxp_excluded0 #watch
                 word=nil if word =~@rxp_excluded1 #watch
+                word=nil if word =~/^\S$/
                 if word
-                  #word.gsub!(/<\/?[i]>/,'')
+                  word.gsub!(/#{Mx[:br_nl]}|#{Mx[:br_line]}/,' ')
+                  word.gsub!(/#{Mx[:lv_o]}\d:\S*?#{Mx[:lv_c]}/,'')
+                  word.gsub!(/#{Mx[:pa_o]}:i\d#{Mx[:pa_c]}/,'')
+                  word.gsub!(/#{Mx[:id_o]}~\d+;\S+?#{Mx[:id_c]}/,'')
+                  word.gsub!(/#{Mx[:fa_o]}[a-z]{1,7}#{Mx[:fa_o_c]}|#{Mx[:fa_c_o]}[a-z]{1,7}#{Mx[:fa_c]}/,'')
+                  word.gsub!(/#{Mx[:mk_o]}(?:[0-9a-f]{32}:[0-9a-f]{32}|[0-9a-f]{64}:[0-9a-f]{64})#{Mx[:mk_c]}/,'')
+                  word.gsub!(/#{Mx[:mk_o]}(?:[0-9a-f]{32}|[0-9a-f]{64})#{Mx[:mk_c]}/,'')
+                  word.gsub!(/#{Mx[:en_a_o]}(?:\d|[*+])*|#{Mx[:en_b_o]}(?:\d|[*+])*|#{Mx[:en_a_c]}|#{Mx[:en_b_c]}/mi,'')
+                  word.gsub!(/#{Mx[:fa_o]}\S+?#{Mx[:fa_o_c]}/,''); word.gsub!(/#{Mx[:fa_c_o]}\S+?#{Mx[:fa_c]}/,'')
+                  #word.gsub!(/#{Mx[:fa_o]}\S+?#{Mx[:fa_c]}/,'') #watch
                   word.gsub!(/<\/?\S+?>/,'')
+                  word.gsub!(/^\@+/,'')
                   word.strip!
+                  word.gsub!(/#{Mx[:tc_p]}.+/,'')
                   word.gsub!(/[\.,;:"]$/,'')
                   word.gsub!(/["]/,'')
                   word.gsub!(/^\s*[\(]/,'')
@@ -252,8 +271,12 @@ WOK
                   word.gsub!(/^(?:See|e\.?g\.?).+/,'')
                   word.gsub!(/^\s*[.,;:]\s*/,'')
                   word.strip!
+                  word.gsub!(/^\(?[a-zA-Z]\)$/,'')
                   word.gsub!(/^\d+(st|nd|rd|th)$/,'')
                   word.gsub!(/^(\d+\.?)+$/, '')
+                  word.gsub(/#{Mx[:mk_o]}|#{Mx[:mk_c]}/,'')
+                  word.gsub!(/^\S$/,'')
+                  word=nil if word =~/^\S$/
                   word=nil if word =~/^\s*$/ #watch
                   if word
                     unless word =~/[A-Z][A-Z]/ \
-- 
cgit v1.2.3