# encoding: utf-8
=begin

 * Name: SiSU

 * Description: a framework for document structuring, publishing and search

 * Author: Ralph Amissah

 * Copyright: (C) 1997 - 2011, Ralph Amissah, All Rights Reserved.

 * License: GPL 3 or later:

   SiSU, a framework for document structuring, publishing and search

   Copyright (C) Ralph Amissah

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the Free
   Software Foundation, either version 3 of the License, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
   more details.

   You should have received a copy of the GNU General Public License along
   with this program. If not, see <http://www.gnu.org/licenses/>.

   If you have Internet connection, the latest version of the GPL should be
   available at these locations:

 * SiSU uses:
   * Standard SiSU markup syntax,
   * Standard SiSU meta-markup syntax, and the
   * Standard SiSU object citation numbering and system

 * Homepages:

 * Download:

 * Ralph Amissah

 ** Description: system environment, resource control and configuration details

=end
module SiSU_text_representation
  # Rewrites the text of a document object (or of a plain String) into
  # alternative representations: whitespace-tidied, markup-stripped,
  # source-like markup, and a light html form.
  #
  # All marker strings come from the Mx table, which is defined outside this
  # file and is interpolated into the regular expressions as-is.
  class Alter
    # Glyph escapes that strip_clean_of_markup turns back into literal
    # characters, as [pattern-source, replacement] pairs applied in order.
    # Each pattern source is wrapped in Mx[:gl_o] ... Mx[:gl_c] when used.
    GLYPH_LITERALS=[
      ['(?:#lt|#060)','<'],
      ['(?:#gt|#062)','>'],
      ['#(?:038|amp)','&'],
      ['#033','!'],
      ['#035','#'],
      ['#042','*'],
      ['#045','-'],
      ['#047','/'],
      ['#095','_'],
      ['#123','{'],
      ['#125','}'],
      ['#169','©']
    ].freeze

    # x is either a String (used directly as the text, with no document
    # object) or an object responding to #obj, whose text is copied.
    def initialize(x)
      if x.is_a?(String)
        @t_o,@s=nil,x
      else
        @t_o,@s=x,x.obj.dup
      end
    end

    # Collapses superfluous spaces (dal output tuned). Returns the new text.
    def strip_clean_of_extra_spaces
      @s=@s.dup
      # spaces before punctuation are only removed when the text carries no
      # endnote opening marker
      unless @s =~/#{Mx[:en_a_o]}|#{Mx[:en_b_o]}/
        @s=@s.gsub(/[ ]+([,.;:?](?:$|\s))/,'\1')
      end
      @s=@s.gsub(/ [ ]+/,' ')
      @s=@s.gsub(/^ [ ]+/,'')
      @s=@s.gsub(/ [ ]+$/,'')
      # rejoin a possessive "'s" separated from a closing bold/italics marker
      # (the original ran this identical substitution twice; once suffices)
      @s=@s.gsub(/((?:#{Mx[:fa_bold_c]}|#{Mx[:fa_italics_c]})')[ ]+(s )/,'\1\2')
    end

    # Plain-text form with font-face markers, endnotes, break markers and
    # glyph escapes removed or replaced; whitespace runs are collapsed and the
    # result stripped. Text form used in sql db search and for digests; the
    # rules are meant to stay the same as in db clean.
    def strip_clean_of_markup
      @s=@s.dup
      [:bold,:italics,:underscore,:cite,:insert,:strike].each do |face|
        replace_face(face,'\1')
      end
      # purely numeric superscripts become bracketed numbers, the rest is unwrapped
      @s=@s.gsub(/#{Mx[:fa_superscript_o]}(\d+)#{Mx[:fa_superscript_c]}/,'[\1]')
      [:superscript,:subscript,:hilite].each do |face|
        replace_face(face,'\1')
      end
      # also covers glyph #126, so no separate #126 rule is needed below
      @s=@s.gsub(/#{Mx[:gl_o]}#(?:126|152)#{Mx[:gl_c]}/i,'~')
      @s=@s.gsub(/#{Mx[:en_a_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_a_c]}/,'') # endnote removed
      @s=@s.gsub(/#{Mx[:en_b_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_b_c]}/,'') # endnote removed
      @s=@s.gsub(/(?:#{Mx[:nbsp]})+/,' ')
      @s=@s.gsub(/(?:#{Mx[:br_nl]})+/,"\n")
      @s=@s.gsub(/(?:#{Mx[:br_paragraph]})+/,"\n")
      @s=@s.gsub(/(?:#{Mx[:br_line]})+/,"\n")
      GLYPH_LITERALS.each do |glyph,char|
        @s=@s.gsub(/#{Mx[:gl_o]}#{glyph}#{Mx[:gl_c]}/,char)
      end
      # one pass leaves no whitespace run longer than one character
      @s=@s.gsub(/\s\s+/,' ')
      @s=@s.strip
    end

    # Turns internal markers back into source-like markup (*{bold}*,
    # ~{endnote}~, heading level and bullet/indent prefixes, block wrappers).
    # Used for digests. When constructed from a plain String there is no
    # document object and the text is returned unchanged.
    def semi_revert_markup
      return @s unless @t_o
      [
        [:bold,       '*{\1}*'],
        [:italics,    '/{\1}/'],
        [:underscore, '_{\1}_'],
        [:cite,       '"{\1}"'],
        [:insert,     '+{\1}+'],
        [:strike,     '-{\1}-'],
        [:superscript,'^{\1}^'],
        [:subscript,  ',{\1},']
      ].each do |face,replacement|
        replace_face(face,replacement)
      end
      @s=@s.gsub(/#{Mx[:gl_o]}#(?:126|152)#{Mx[:gl_c]}/i,'~')
      @s=@s.gsub(/#{Mx[:en_a_o]}([\d*+]+\s+.+?)#{Mx[:en_a_c]}/,'~{\1}~') # endnote marker marked up
      @s=@s.gsub(/#{Mx[:en_b_o]}([\d*+]+\s+.+?)#{Mx[:en_b_c]}/,'~[\1]~') # endnote marker marked up
      if @t_o.is=='heading' || @t_o.is=='para'
        @s=@s.gsub(/ [ ]+/,' ')
        @s=@s.gsub(/(?:#{Mx[:nbsp]})+/,' ')
        if @t_o.is=='heading'
          @s=@t_o.lv + '~ ' + @s
        end
        if @t_o.is=='para'
          if @t_o.bullet_
            @s='_* ' + @s
          end
          if @t_o.indent.to_i > 0
            @s="_#{@t_o.indent} " + @s
            # an indented bullet is written "_1* " rather than "_1 _* "
            @s=@s.gsub(/^(_[1-9])\s_\*\s/,'\1* ')
          end
        end
      end
      if @t_o.is=='block' || @t_o.is=='group' || @t_o.is=='code'
        @s=@s.gsub(/#{Mx[:nbsp]}/,' ')
        @s="#{@t_o.is}{\n\n#{@s}\n\n}#{@t_o.is}"
        @s=@s.gsub(/(?:#{Mx[:br_nl]}|\n)+/m,"\n\n")
      end
      # dealing with poem and verse calls for change in dal, where start and
      # end verse of poem are marked as such
      @s=@s.strip
      @s
    end

    # Light html-oriented form; experimental (test whether eventually can be
    # used in db_import replacing shared_html_lite, search for
    # SiSU_Format_Shared). Without a document object it prints the file and
    # line position and returns the text unchanged.
    #
    # NOTE(review): `urls` and `@url_brace` are not defined anywhere in this
    # class as visible here, so the non-code path raises unless they are
    # provided elsewhere - confirm.
    # NOTE(review): several replacement strings below are the bare capture
    # ('\1'); given the method name they presumably once carried html tags -
    # confirm against the upstream source before relying on this output.
    def html_lite
      unless @t_o
        p __FILE__ +':'+ __LINE__.to_s
        return @s
      end
      [
        [:bold,       '\1'],
        [:italics,    '\1'],
        [:underscore, '\1'],
        [:cite,       '"\1"'],
        [:insert,     '+{\1}+'],
        [:strike,     '-{\1}-'],
        [:superscript,'\1'],
        [:subscript,  '\1']
      ].each do |face,replacement|
        replace_face(face,replacement)
      end
      @s=@s.gsub(/#{Mx[:gl_o]}#(?:126|152)#{Mx[:gl_c]}/i,'~')
      if @t_o.is !='code'
        if @s =~/#{Mx[:lnk_o]}.+?#{Mx[:lnk_c]}(?:#{Mx[:url_o]}\S+?#{Mx[:url_c]}|image)/
          wm=@s.scan(/#{Mx[:lnk_o]}.+?#{Mx[:lnk_c]}(?:#{Mx[:url_o]}\S+?#{Mx[:url_c]}|image)|\S+/)
          words=urls(wm)
          @s=@s.gsub(/.+/m,words)
        end
        @s=@s.gsub(/#{Mx[:gl_o]}(#[0-9]{3})#{Mx[:gl_c]}/u,'&\1;')
        @s=@s.gsub(/#{Mx[:gl_o]}#([a-z]{2,4})#{Mx[:gl_c]}/u,'&\1;')
        @s=@s.gsub(/#{Mx[:url_o]}_(\S+?)#{Mx[:url_c]}/,'\1') #http ftp matches escaped, no decoration
        @s=@s.gsub(/(#{Mx[:lnk_c]})#{Mx[:url_o]}(\S+?)#{Mx[:url_c]}/,'\1\2\3') #special case \{ e.g. \}http://url
        @s=@s.gsub(/#{Mx[:url_o]}(\S+?)#{Mx[:url_c]}/,%{#{@url_brace.xml_open}\\1#{@url_brace.xml_close}}) #http ftp matches with decoration
      else
        # code-block: angle brackets special characters
        # (kept as two passes: a match consumes the character before the "_",
        # so directly adjacent underscores are only caught by the second pass)
        @s=@s.gsub(/(^|[^}])_/m,'\1>')
        @s=@s.gsub(/(^|[^}])_/m,'\1>')
      end
      # paragraphs (bulleted or indented) and headings get no further
      # decoration here; the original branches for them were no-ops
      @s
    end

    # Replaces every "<open>text<close>" span of the given font face with
    # +replacement+ (which may reference the enclosed text as \1). Both marker
    # keys are derived from the one face name, so the opening and closing
    # lookups cannot drift apart.
    def replace_face(face,replacement)
      face_o=Mx["fa_#{face}_o".to_sym]
      face_c=Mx["fa_#{face}_c".to_sym]
      @s=@s.gsub(/#{face_o}(.+?)#{face_c}/,replacement)
    end
    private :replace_face
  end

  # Produces alternative text forms of a document object together with their
  # hex digests (sha256 or md5, as reported by the environment's digest type).
  #
  # The selector methods (strip_clean_of_markup, semi_revert_markup,
  # composite) each define their helper methods (txt, dgst, ...) with nested
  # `def` and return self; a nested `def` (re)defines the method for the whole
  # class, so the last selector called decides what #txt / #dgst do. Use the
  # result immediately, e.g. `.strip_clean_of_markup.dgst`.
  class Modified_text_plus_Hash_digest
    # md must respond to #fns (and, for image lookups, #opt); x is either a
    # String or a document object responding to #obj.
    def initialize(md,x)
      @md=md
      if x.is_a?(String)
        @t_o,@s=nil,x
      else
        @t_o,@s=x,x.obj.dup
      end
      @env ||=SiSU_Env::Info_env.new(@md.fns)
      @sha_ =((@env.digest.type =='sha256') ? true : false)
      # only the digest implementation actually selected is loaded
      @sha_ ? (require 'digest/sha2') : (require 'digest/md5')
    end

    # Hex digest of txt using the selected algorithm.
    def digest(txt)
      hash_class=(@sha_ ? Digest::SHA256 : Digest::MD5)
      hash_class.hexdigest(txt)
    end

    # Selector: #txt is the markup-stripped text, #dgst a hash with that text
    # and its digest.
    def strip_clean_of_markup
      def txt
        SiSU_text_representation::Alter.new(@s).strip_clean_of_markup
      end
      def dgst
        txt_dgst=digest(txt)
        { txt: txt, dgst_txt: txt_dgst }
      end
      self
    end

    # Selector: #txt is the text passed through Alter#semi_revert_markup
    # (which leaves a plain String untouched), #dgst a hash with that text and
    # its digest.
    def semi_revert_markup
      def txt
        SiSU_text_representation::Alter.new(@s).semi_revert_markup
      end
      def dgst
        txt_dgst=digest(txt)
        { txt: txt, dgst_txt: txt_dgst }
      end
      self
    end

    # Selector: #dgst returns the digests of the stripped and markup-reverted
    # text of the document object, plus per-endnote and per-image digests
    # where present.
    def composite
      def stripped_clean(txt)
        SiSU_text_representation::Alter.new(txt).strip_clean_of_markup
      end
      def markup_reverted(txt)
        SiSU_text_representation::Alter.new(txt).semi_revert_markup
      end
      # For each image file name: locate it in the local, remote or default
      # image source directory (in that order) and record name, type and
      # digest. A missing image is reported (unless the command string
      # contains "q") and recorded with a nil digest.
      def images(imgs)
        sys=SiSU_Env::System_call.new
        line_image=[]
        if imgs and imgs.length > 0
          imgs.each do |i|
            image_source=if FileTest.file?("#{@env.path.image_source_include_local}/#{i}")
              @env.path.image_source_include_local
            elsif FileTest.file?("#{@env.path.image_source_include_remote}/#{i}")
              @env.path.image_source_include_remote
            elsif FileTest.file?("#{@env.path.image_source_include}/#{i}")
              @env.path.image_source_include
            else
              SiSU_Screen::Ansi.new(@md.opt.cmd,"ERROR - image:", %{"#{i}" missing}, "search locations: #{@env.path.image_source_include_local}, #{@env.path.image_source_include_remote} and #{@env.path.image_source_include}").error2 unless @md.opt.cmd =~/q/
              nil
            end
            # nil (rather than an exception) when the name has no known extension
            img_type=i[/\S+\.(png|jpg|gif)/,1]
            if image_source
              para_image=image_source + '/' + i
              image_name=i
              # the system call result is indexed at [1] for the digest value
              # NOTE(review): presumably [label, digest] - confirm in SiSU_Env
              image_dgst=(@sha_ ? sys.sha256(para_image) : sys.md5(para_image))[1]
            else
              image_name=i + ' [image missing]'
              image_dgst=nil
            end
            line_image << { img_dgst: image_dgst, img_name: image_name, img_type: img_type }
          end
        end
        line_image
      end
      # For each endnote text ("<number> <note>"): its number and a digest.
      # NOTE(review): the note digest is the digest of the digest of the
      # stripped note text (hashed twice); kept as-is because changing it
      # would alter every recorded endnote digest - confirm intent.
      def endnotes(en)
        en_dgst=[]
        if en and en.length > 0
          en.flatten.each do |e|
            note_no=e.gsub(/^([\d*+]+)\s+.+/,'\1')
            stripped_dgst=digest(stripped_clean(e))
            note_dgst=digest(stripped_dgst)
            en_dgst << { note_number: note_no, note_dgst: note_dgst }
          end
        end
        en_dgst
      end
      # Returns nil for comment, structure and layout objects.
      def dgst
        return nil if @t_o.of =='comment' || @t_o.of =='structure' || @t_o.of =='layout'
        txt_stripped_dgst=digest(stripped_clean(@t_o))
        txt_markup_reverted_dgst=digest(markup_reverted(@t_o))
        rgx_notes=/(?:#{Mx[:en_a_o]}|#{Mx[:en_b_o]})([\d*+]+\s+.+?)(?:#{Mx[:en_a_c]}|#{Mx[:en_b_c]})/
        endnotes_dgst=endnotes(@t_o.obj.scan(rgx_notes))
        rgx_image=/#{Mx[:lnk_o]}(\S+\.(?:png|jpg|gif))\s.+?#{Mx[:lnk_c]}(?:#{Mx[:url_o]}\S+?#{Mx[:url_c]}|image)/
        line_image=nil
        if (@t_o.is=='para' || @t_o.is=='image') \
        and @t_o.obj =~rgx_image
          line_image=images(@t_o.obj.scan(rgx_image).flatten)
        end
        result={
          is: @t_o.is,
          ocn: @t_o.ocn,
          dgst_stripped_txt: txt_stripped_dgst,
          dgst_markedup_txt: txt_markup_reverted_dgst
        }
        result[:endnotes]=endnotes_dgst if endnotes_dgst and endnotes_dgst.length > 0
        result[:images]=line_image if line_image and line_image.length > 0
        result
      end
      self
    end
  end
end
__END__