=begin
 * Name: SiSU information Structuring Universe - Structured information, Serialized Units
 * Author: Ralph Amissah
   * http://www.jus.uio.no/sisu
   * http://www.jus.uio.no/sisu/SiSU/download.html
 * Description: concordance file (html concordance, wordmap, linked index of words in document)
 * Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 Ralph Amissah
 * License: GPL 2 or later

   Summary of GPL 2

   This program is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 2 of the License, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
   more details.

   You should have received a copy of the GNU General Public License along
   with this program; if not, write to the Free Software Foundation, Inc.,
   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

   If you have Internet connection, the latest version of the GPL should be
   available at these locations:
     http://www.fsf.org/licenses/gpl.html
     http://www.gnu.org/copyleft/gpl.html
     http://www.jus.uio.no/sisu/gpl2.fsf

   SiSU was first released to the public on January 4th 2005

   SiSU uses:
     * Standard SiSU markup syntax,
     * Standard SiSU meta-markup syntax, and the
     * Standard SiSU object citation numbering and system

   © Ralph Amissah 1997, current 2007. All Rights Reserved.

 * Ralph Amissah: ralph@amissah.com ralph.amissah@gmail.com
=end
# Concordance (word index) output: scans the document's processed lines,
# counts words, records where each word occurs and writes the index file.
#
# NOTE(review): the header above describes the output as HTML, yet the string
# templates in this copy contain no tags; presumably markup was lost from this
# copy of the file - confirm against the upstream source before relying on the
# literal text below.
module SiSU_Concordance
  require "#{SiSU_lib}/param"
  require "#{SiSU_lib}/sysenv"
  require "#{SiSU_lib}/defaults"
  require "#{SiSU_lib}/dal"
  include SiSU_Param
  include SiSU_Env
  include SiSU_Viz
  require "#{SiSU_lib}/html_format_css"
  include SiSU_HTML_Format
  # Entry point: decides whether a concordance is produced for the document
  # described by +opt+ and delegates the work to Words.
  class Source
    # opt:: option object handed on to SiSU_Param::Parameters
    def initialize(opt)
      @opt=opt
    end
    # Announces the run (unless the command string contains 'q') and builds
    # the concordance. Documents whose known word count reaches +wordmax+ are
    # skipped with a warning; when no word count is available a warning is
    # shown and the document is processed regardless of size.
    # Errors are reported through SiSU_Errors::Info_error.
    def read
      begin
        @md=SiSU_Param::Parameters.new(@opt).get
        @env=SiSU_Env::Info_env.new(@md.fns)
        loc=@env.url.output_tell
        # viewer command line is only assembled for the M, V and v command flags
        tool=if @md.cmd =~/[MVv]/; "#{@env.program.web_browser} #{loc}/#{@md.fnb}/#{@md.fn[:concordance]}"
        else ''
        end
        tell=SiSU_Screen::Ansi.new(@md.cmd,"Concordance",tool)
        tell.grey_title_hi unless @md.cmd =~/q/
        wordmax=200000
        unless @md.wc_words.nil?
          if @md.wc_words < wordmax
            SiSU_Concordance::Source::Words.new(@md).songsheet
          else
            tell=SiSU_Screen::Ansi.new(@md.cmd,"concordance skipped, large document has over #{wordmax} words (#{@md.wc_words})")
            tell.warn unless @md.cmd =~/q/
          end
        else
          tell=SiSU_Screen::Ansi.new(@md.cmd,"wc (word count) is off, concordance will be processed for all files including those over the max set size of: #{wordmax} words")
          tell.warn unless @md.cmd =~/q/
          SiSU_Concordance::Source::Words.new(@md).songsheet
        end
      rescue; SiSU_Errors::Info_error.new($!,$@,@md.cmd,@md.fns).error
      ensure
      end
    end
    private
    # Builds the introductory text placed at the top of the concordance file.
    class Doc_title
      require "#{SiSU_lib}/param"
      include SiSU_Param
      include SiSU_Viz
      # lnk:: link label stored in @lnk
      # md::  document parameter object (title, file names, output directory)
      def initialize(lnk,md)
        @md=md
        @vz=SiSU_Env::Get_init.instance.skin
        file_array=IO.readlines(@md.fns,'')
        txt_path=%{#{@md.dir_out}}
        SiSU_Env::Info_skin.new(@md).select
        @md_title=@md.title
        @fnb=@md.fnb
        @env=SiSU_Env::Info_env.new
        @lex_button=%{SiSU home -->}
        @lnk=lnk
        # document details block, interpolated into the page head by #create
        @doc_details =<<WOK

Manifest #{@md.title}

            TOC TOC - table of contents for individual articles

            Full Text Full text (with indexed table of contents)

            PDF portrait pdf version of the document (portrait)

            PDF landscape pdf version of the document (landscape)

Word index links are to html versions of the text the segmented version followed by the scroll (single document) version.
[For segmented text references [T1], [T2] or [T3] appearing without a link, indicates that the word appears in a title (or subtitle) of the text (that is identifiable by the appended object citation number).]

WOK
      end
      # Returns the head of the concordance page as a single string.
      def create
<<WOK
SiSU created WordIndex for: #{@md.dc_title} #{@vz.js_head} #{@vz.js_top}
#{@vz.banner_home_button_only} #{@env.widget_static.search_form}
#@doc_details

(The word listing/index is Case sensitive: Capitalized words appear before lower case)

word (number of occurences)
linked references to word within document
[if number of occurences exceed number of references - word occurs more than once in at least one reference. Footnote/endnotes are either assigned to the paragraph from which they are referenced or ignored, so it is relevant to check the footnotes referenced from within a paragraph as well.]

(After the page is fully loaded) you can jump directly to a word by appending a hash (#) and the word to the url for this text, (do not forget that words are case sensitive, and may be listed twice (starting with and without an upper case letter)), #your_word # [ http://[web host]/#@fnb/concordance.html#your_word ]

WOK
      end
    end
    # Formats the heading for one indexed word together with its frequency.
    class Word
      # capitalized form of the word handled by the previous #html call,
      # shared by all instances
      @@word_previous=''
      # word:: the indexed word
      # freq:: number of occurrences counted for it
      def initialize(word,freq)
        @word,@freq=word,freq
      end
      # Returns the heading string for the word and records its capitalized
      # form in @@word_previous for the next call.
      # NOTE(review): both branches yield the same text in this copy and +n+
      # is not used; presumably the templates differed before markup was
      # lost - confirm.
      def html
        w=if @word.capitalize==@@word_previous
          %{\n

#@word

(#@freq)

\n\t

}
        else
          n=@word.strip.gsub(/\s+/,'_') #also need to convert extended character set to html
          %{\n

#@word

(#@freq)

\n\t

}
        end
        @@word_previous=@word.capitalize
        w
      end
    end
    # Scans the document, collects word frequencies and locations, and writes
    # the concordance file.
    class Words
      require "#{SiSU_lib}/defaults"
      require "#{SiSU_lib}/param"
      include SiSU_Viz
      include SiSU_Param
      require "#{SiSU_lib}/html_format_css"
      include SiSU_HTML_Format
      require "#{SiSU_lib}/sysenv"
      include SiSU_Screen
      # digest pattern, looked up once and reused by later instances
      @@dp=nil
      # md:: document parameter object
      # Loads the document lines via SiSU_DAL::Source and prepares the regular
      # expressions used by #map_para. Errors are reported through
      # SiSU_Errors::Info_error.
      def initialize(md)
        begin
          @vz=SiSU_Env::Get_init.instance.skin
          @md=md
          @env=SiSU_Env::Info_env.new(@md.fns)
          @path="#{@env.path.output}/#{@md.fnb}"
          @dal_array=SiSU_DAL::Source.new(@md).get # dal file drawn here
          @freq=Hash.new(0)
          @dp=@@dp ||=SiSU_Env::Info_env.new.digest.pattern
          @rxp_to=Regexp.new("<~(\\d+);(?:[oh]|[0-6]:)\\d+;\\w\\d+><#@dp:#@dp>$")
          @rxp_lv1=Regexp.new('^1~') #line start markers removed, ('^1~') for exceptions \n\n4{{{
          @rxp_lv2=Regexp.new('^2~')
          @rxp_lv3=Regexp.new('^3~')
          @rxp_seg=Regexp.new('^4~(.+?)\s+')
          @rxp_title=Regexp.new('^0~title\s*(.+?)\s*$')
          @rxp_t1=Regexp.new('^T1')
          @rxp_t2=Regexp.new('^T2')
          @rxp_t3=Regexp.new('^T3')
          @rxp_excluded1=/(?:https?|ftp):\/\/\S+/mi
          @rxp_excluded0=/^(?:to\d+|\d+| |EOF|thumb_\S+|snap_\S+|_+|-+|ii+|iv|vi+|ix|xi+|xiv|xv|xvi+|xix|xx|\S+?_\S+|[\d_]+\w\S+|[\w\d]{1,2}|\d{1,3}\w?|#@dp|[0-9a-f]{16,64}|\d{2,3}x\d{2,3}|\S{0,2}sha\d|\S{0,3}\d{4}w\d\d|\b\w\d+|\d_all\b|e\.?g\.?)$/mi #this regex causes and cures a stack dump in ruby 1.9 !!!
          @rgx_scanlist=%r{(?:(?:[a-zA-Z0-9"\s]){2,7}|(?:[a-zA-Z0-9"\s]){2,7}|http://\S+)|code\{.+?\}code|<\S+?>|\w+}mi
        rescue; SiSU_Errors::Info_error.new($!,$@,@md.cmd,@md.fns).error
        end
      end
      # Creates the output directory if needed, opens the concordance file for
      # writing and fills it via #map_para. The file is closed afterwards.
      def songsheet
        begin
          File.mkpath(@path) unless FileTest.directory?(@path)
          @file_index_all=File.open("#@path/#{@md.fn[:concordance]}",'w')
          map_para
        rescue; SiSU_Errors::Info_error.new($!,$@,@md.cmd,@md.fns).error
        ensure
          # guard: if mkpath or File.open raised, no file handle was assigned
          # and an unconditional close would raise again from within ensure
          @file_index_all.close if @file_index_all
        end
      end
      protected
      # Returns the scroll-version reference text for +wordlocation+
      # (+show+ is accepted but not used).
      def location_scroll(wordlocation,show)
        @wordlocation=wordlocation
        %{#@wordlocation; }
      end
      # Returns the reference text for one occurrence: locations starting with
      # T1, T2 or T3 get the "[H]" prefix, all others show +show+ only.
      def location_seg(wordlocation,show)
        @wordlocation,@show=wordlocation,show
        @sfx='.html' #used for hardlinks, previous setting @sfx='', web server takes care of suffix
        @word_location_seg=wordlocation.gsub(/(.+?)\#(\d+)/,"#{@md.fnl[:pre]}\\1#{@md.fnl[:mid]}#@sfx#{@md.fnl[:post]}#\\2") unless wordlocation.nil?
        case @wordlocation
        when @rxp_t1
          %{[H]#@show, }
        when @rxp_t2
          %{[H]#@show, }
        when @rxp_t3
          %{[H]#@show, }
        else
          %{#@show, }
        end
      end
      # Walks every document line, extracts and normalizes words, tallies them
      # in @freq, records their locations in @word_map, then writes the page
      # head, one entry per word (sorted case-insensitively) and the footer to
      # @file_index_all.
      def map_para
        @seg,toy=nil,nil
        @word_map={}
        @dal_array.each do |line|
          if line !~/<~(\d+);[um]\d+;\w\d+><#@dp:#@dp>$/ # lines to ignore: # are added but not part of authors substantive text; 0 are mostly machine generated
            if line =~@rxp_seg; @seg=line[@rxp_seg,1]
            end
            if line =~@rxp_to; toy=line[@rxp_to,1]
            end
            if toy =~/\d+/ and toy !~/^0$/
              for word in line.scan(@rgx_scanlist) #%take in word or other match
                word=nil if word =~@rxp_excluded0 #watch
                word=nil if word =~@rxp_excluded1 #watch
                if word
                  #word.gsub!(/<\/?[i]>/,'')
                  word.gsub!(/<\/?\S+?>/,'')
                  word.strip!
                  word.gsub!(/[\.,;:"]$/,'')
                  word.gsub!(/["]/,'')
                  word.gsub!(/^\s*[\(]/,'')
                  word.gsub!(/[\(]\s*$/,'')
                  word.gsub!(/^(?:See|e\.?g\.?).+/,'')
                  word.gsub!(/^\s*[.,;:]\s*/,'')
                  word.strip!
                  word.gsub!(/^\d+(st|nd|rd|th)$/,'')
                  word.gsub!(/^(\d+\.?)+$/, '')
                  word=nil if word =~/^\s*$/ #watch
                  if word
                    word.capitalize! unless word =~/[A-Z][A-Z]/ or word =~/\w+\s\w+/
                    #word.downcase! if word =~lesser
                    #word.capitalize! if word =~greater
                    @freq[word] +=1
                    @word_map[word] ||= []
                    # lines at levels 1-3 are recorded as T1/T2/T3 instead of
                    # a segment location
                    if line !~@rxp_lv1 and line !~@rxp_lv2 and line !~@rxp_lv3
                      @word_map[word] << location_seg("#@seg\##{toy}",toy)
                    else
                      @word_map[word] << case line
                      when @rxp_lv1; location_seg('T1',toy)
                      when @rxp_lv2; location_seg('T2',toy)
                      when @rxp_lv3; location_seg('T3',toy)
                      end
                    end
                  end
                end
              end
            end
          end
        end
        scr='Full Text scroll: doc#  '
        seg=''
        @file_index_all << SiSU_Concordance::Source::Doc_title.new('toc',@md).create
        for word in @freq.keys.sort! {|a,b| a.downcase<=>b.downcase}
          keyword=SiSU_Concordance::Source::Word.new(word,@freq[word]).html
          if keyword !~ @rxp_excluded0
            if @word_map[word][0] =~ /\d+/
              wm=[]
              @file_index_all << %{#{keyword}#{seg}#{@word_map[word].uniq.compact.join}}
            end
            @file_index_all << '

'
          end
          # special cases endnotes and header levels 1 - 3
        end
        credits=@vz.credits_splash
        @file_index_all << "#{credits}\n" # footer
        tell=SiSU_Screen::Ansi.new(@md.cmd,@md.fns,"#{@env.path.output_tell}/#{@md.fn[:concordance]}")
        tell.flow if @md.cmd =~/[MV]/
      end
    end
  end
end
__END__