From d29a3e5469d8468084641c385ebf16948f7c2437 Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Tue, 22 Jul 2008 20:00:59 -0400 Subject: sisu-0.68.0 proposed * middle layer document representation changed, (accounting for substantial patch) * texpdf multiple document sizes as specified in config * numerous small fixes [should on the whole be easier to maintain] --- lib/sisu/v0/concordance.rb | 49 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 13 deletions(-) (limited to 'lib/sisu/v0/concordance.rb') diff --git a/lib/sisu/v0/concordance.rb b/lib/sisu/v0/concordance.rb index 1b777bb5..f62b20ac 100644 --- a/lib/sisu/v0/concordance.rb +++ b/lib/sisu/v0/concordance.rb @@ -183,18 +183,18 @@ WOK @path="#{@env.path.output}/#{@md.fnb}" @freq=Hash.new(0) @dp=@@dp ||=SiSU_Env::Info_env.new.digest.pattern - @rxp_to=Regexp.new("<~(\\d+);(?:[oh]|[0-6]:)\\d+;\\w\\d+><#@dp:#@dp>$") - @rxp_lv1=Regexp.new('^1~') #line start markers removed, ('^1~') for exceptions \n\n4{{{ - @rxp_lv2=Regexp.new('^2~') - @rxp_lv3=Regexp.new('^3~') - @rxp_seg=Regexp.new('^4~(.+?)\s+') - @rxp_title=Regexp.new('^0~title\s*(.+?)\s*$') + @rxp_to=Regexp.new("#{Mx[:id_o]}~(\\d+);(?:[oh]|[0-6]:)\\d+;\\w\\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}|#{Mx[:id_o]}\S+?#{Mx[:id_c]}$") + @rxp_lv1=/^#{Mx[:lv_o]}1:/ + @rxp_lv2=/^#{Mx[:lv_o]}2:/ + @rxp_lv3=/^#{Mx[:lv_o]}3:/ + @rxp_seg=/^#{Mx[:lv_o]}4:(\S+?)#{Mx[:lv_c]}/ + @rxp_title=Regexp.new("^#{Mx[:meta_o]}title#{Mx[:meta_c]}\s*(.+?)\s*$") @rxp_t1=Regexp.new('^T1') @rxp_t2=Regexp.new('^T2') @rxp_t3=Regexp.new('^T3') - @rxp_excluded1=/(?:https?|file|ftp):\/\/\S+/mi - @rxp_excluded0=/^(?:to\d+|\d+| |EOF|thumb_\S+|snap_\S+|_+|-+|ii+|iv|vi+|ix|xi+|xiv|xv|xvi+|xix|xx|\S+?_\S+|[\d_]+\w\S+|[\w\d]{1,2}|\d{1,3}\w?|#@dp|[0-9a-f]{16,64}|\d{2,3}x\d{2,3}|\S{0,2}sha\d|\S{0,3}\d{4}w\d\d|\b\w\d+|\d_all\b|e\.?g\.?)$/mi #this regex causes and cures a stack dump in ruby 1.9 !!! - @rgx_scanlist=%r{(?:(?:[a-zA-Z0-9"\s]){2,7}|(?:[a-zA-Z0-9"\s]){2,7}|(?:https?|file)://\S+)|code\{.+?\}code|<\S+?>|\w+}mi + @rxp_excluded1=/(?:https?|file|ftp):\/\/\S+/ + @rxp_excluded0=/^(?:#{Mx[:fa_bold_o]}|#{Mx[:fa_italics_o]})?(?:to\d+|\d+| |#{Mx[:br_endnotes]}|EOF|#{Mx[:br_eof]}|thumb_\S+|snap_\S+|_+|-+|[(]?(?:ii+|iv|vi+|ix|xi+|xiv|xv|xvi+|xix|xx)[).]?|\S+?_\S+|[\d_]+\w\S+|[\w\d]{1,2}|\d{1,3}\w?|#@dp|[0-9a-f]{16,64}|\d{2,3}x\d{2,3}|\S{0,2}sha\d|\S{0,3}\d{4}w\d\d|\b\w\d+|\d_all\b|e\.?g\.?)(?:#{Mx[:fa_bold_c]}|#{Mx[:fa_italics_c]})?$/mi #this regex causes and cures a stack dump in ruby 1.9 !!! + @rgx_scanlist=%r{#{Mx[:fa_italics_o]}[a-zA-Z0-9"\s]{2,12}#{Mx[:fa_italics_c]}|#{Mx[:fa_bold_o]}[a-zA-Z0-9"\s]{2,12}#{Mx[:fa_bold_c]}|(?:https?|file)://\S+|#{Mx[:gr_o]}code#{Mx[:gr_o]}.+?#{Mx[:gr_o]}code-end#{Mx[:gr_o]}|<\S+?>|#{Mx[:id_o]}\S+?#{Mx[:id_c]}|\w+|[a-zA-Z]+}mi rescue; SiSU_Errors::Info_error.new($!,$@,@md.cmd,@md.fns).error end end @@ -231,20 +231,39 @@ WOK @seg,toy=nil,nil @word_map={} @dal_array.each do |line| - if line !~/<~(\d+);[um]\d+;\w\d+><#@dp:#@dp>$/ # lines to ignore: # are added but not part of authors substantive text; 0 are mostly machine generated - if line =~@rxp_seg; @seg=line[@rxp_seg,1] + if line !~/#{Mx[:id_o]}~(\d+);[um]\d+;\w\d+#{Mx[:id_c]}#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}$/ #lines to ignore: # are added but not part of authors substantive text; 0 are mostly machine generated + if line =~@rxp_seg; @seg=line[@rxp_seg,1] end - if line =~@rxp_to; toy=line[@rxp_to,1] + if line =~@rxp_to; toy=line[@rxp_to,1] end if toy =~/\d+/ \ and toy !~/^0$/ for word in line.scan(@rgx_scanlist) #%take in word or other match + #word.gsub!(@rxp_clean,'') + word.gsub!(/#{Mx[:fa_o]}\S+?#{Mx[:fa_o_c]}/,'') + word.gsub!(/#{Mx[:fa_c_o]}\S+?#{Mx[:fa_c]}/,'') + word.gsub!(/#{Mx[:gl_o]}#[a-z]+#{Mx[:gl_c]}/,'') + word.gsub!(/#{Mx[:gl_o]}#[0-9]+#{Mx[:gl_c]}/,'') + word.gsub!(/^\S$/,'') + word=nil if word.empty? word=nil if word =~@rxp_excluded0 #watch word=nil if word =~@rxp_excluded1 #watch + word=nil if word =~/^\S$/ if word - #word.gsub!(/<\/?[i]>/,'') + word.gsub!(/#{Mx[:br_nl]}|#{Mx[:br_line]}/,' ') + word.gsub!(/#{Mx[:lv_o]}\d:\S*?#{Mx[:lv_c]}/,'') + word.gsub!(/#{Mx[:pa_o]}:i\d#{Mx[:pa_c]}/,'') + word.gsub!(/#{Mx[:id_o]}~\d+;\S+?#{Mx[:id_c]}/,'') + word.gsub!(/#{Mx[:fa_o]}[a-z]{1,7}#{Mx[:fa_o_c]}|#{Mx[:fa_c_o]}[a-z]{1,7}#{Mx[:fa_c]}/,'') + word.gsub!(/#{Mx[:mk_o]}(?:[0-9a-f]{32}:[0-9a-f]{32}|[0-9a-f]{64}:[0-9a-f]{64})#{Mx[:mk_c]}/,'') + word.gsub!(/#{Mx[:mk_o]}(?:[0-9a-f]{32}|[0-9a-f]{64})#{Mx[:mk_c]}/,'') + word.gsub!(/#{Mx[:en_a_o]}(?:\d|[*+])*|#{Mx[:en_b_o]}(?:\d|[*+])*|#{Mx[:en_a_c]}|#{Mx[:en_b_c]}/mi,'') + word.gsub!(/#{Mx[:fa_o]}\S+?#{Mx[:fa_o_c]}/,''); word.gsub!(/#{Mx[:fa_c_o]}\S+?#{Mx[:fa_c]}/,'') + #word.gsub!(/#{Mx[:fa_o]}\S+?#{Mx[:fa_c]}/,'') #watch word.gsub!(/<\/?\S+?>/,'') + word.gsub!(/^\@+/,'') word.strip! + word.gsub!(/#{Mx[:tc_p]}.+/,'') word.gsub!(/[\.,;:"]$/,'') word.gsub!(/["]/,'') word.gsub!(/^\s*[\(]/,'') @@ -252,8 +271,12 @@ WOK word.gsub!(/^(?:See|e\.?g\.?).+/,'') word.gsub!(/^\s*[.,;:]\s*/,'') word.strip! + word.gsub!(/^\(?[a-zA-Z]\)$/,'') word.gsub!(/^\d+(st|nd|rd|th)$/,'') word.gsub!(/^(\d+\.?)+$/, '') + word.gsub(/#{Mx[:mk_o]}|#{Mx[:mk_c]}/,'') + word.gsub!(/^\S$/,'') + word=nil if word =~/^\S$/ word=nil if word =~/^\s*$/ #watch if word unless word =~/[A-Z][A-Z]/ \ -- cgit v1.2.3