=begin

 * Name: SiSU information Structuring Universe - Structured information, Serialized Units
 * Author: Ralph Amissah
   * http://www.jus.uio.no/sisu
   * http://www.jus.uio.no/sisu/SiSU/download.html
 * Description: preprocessing, (document abstraction), data abstraction used in subsequent processing
 * Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 Ralph Amissah
 * License: GPL 2 or later

  Summary of GPL 2

  This program is free software; you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the Free
  Software Foundation; either version 2 of the License, or (at your option)
  any later version.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  more details.

  You should have received a copy of the GNU General Public License along
  with this program; if not, write to the Free Software Foundation, Inc.,
  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

  If you have Internet connection, the latest version of the GPL should be
  available at these locations:
    http://www.fsf.org/licenses/gpl.html
    http://www.gnu.org/copyleft/gpl.html
    http://www.jus.uio.no/sisu/gpl2.fsf

  SiSU was first released to the public on January 4th 2005

  SiSU uses:
  * Standard SiSU markup syntax,
  * Standard SiSU meta-markup syntax, and the
  * Standard SiSU object citation numbering and system

  © Ralph Amissah 1997, current 2007. All Rights Reserved.

 * Ralph Amissah: ralph@amissah.com
                  ralph.amissah@gmail.com

=end
# NOTE(review): this copy of the file arrived with its line structure collapsed
# and with tag-like text (anything of the form <...>) stripped from several
# string/regex literals. Line structure has been restored below; every place
# where literal text appears to be missing is marked NOTE(review) and should be
# checked against the upstream source before release.
module SiSU_DAL
  require "#{SiSU_lib}/defaults"
  require "#{SiSU_lib}/sysenv"
  require "#{SiSU_lib}/param"
  require "#{SiSU_lib}/dal_syntax"
  require "#{SiSU_lib}/dal_doc_str"
  require "#{SiSU_lib}/i18n"
  include SiSU_Env
  include SiSU_Param
  include SiSU_Viz
  include Syntax
  # Instantiating this class resets the class-level endnote/numbering state to
  # its start values (same assignments as Make#reset).
  class Instantiate < SiSU_Param::Parameters::Instructions
    def initialize
      @@flag_vocab=0
      @@endnote={}
      @@endnote_array=@@word_mode=[]
      @@endnote_counter,@@endnote_counter_asterisk,@@endnote_counter_dag=1,1,1 #added
      @@line_mode=''
    end
  end
  class Source
    # NOTE(review): everything between the class header and the tail of one
    # method has been lost from this copy (at least whatever sets @opt and @fnm,
    # and the method `create_dal` that read_fnm calls below). The surviving tail
    # cannot be parsed without its missing `def`, so it is kept here verbatim as
    # a comment only - TODO restore the class head from upstream:
    #
    #   #{@my_make_fns.meta}") if @md.cmd =~/M/
    #   tell.txt_grey unless @md.cmd =~/q/
    #   dal.each{|s| dal_array << "#{s.strip}\n\n" unless s.strip.empty?}
    #   dal_array
    # end

    # Returns the document abstraction: unmarshalled from the file named by @fnm
    # when that file exists, otherwise freshly created via Source#create_dal.
    def read_fnm
      dal=[]
      dal=if FileTest.file?(@fnm)
        # NOTE(review): Marshal.load executes whatever the file encodes; only safe
        # while @fnm is a file this program wrote itself - confirm it cannot be
        # supplied by an untrusted party
        File.open(@fnm){ |f| dal=Marshal.load(f)}
      else SiSU_DAL::Source.new(@opt).create_dal
      end
    end
  end
  # Writes the abstraction either as a plain text ".meta" file or as a Marshal dump.
  class Output
    # md:   document parameters (reads md.cmd and md.fns)
    # data: array of abstraction objects (strings)
    def initialize(md,data)
      @md,@data=md,data
      @my_make=SiSU_Env::Create_file.new(@md.cmd,@md.fns)
      dir=SiSU_Env::Info_env.new(@md.fns)
      @hard="#{dir.path.dal}/#{@md.fns}.meta"
    end
    # With the M flag in the command string: writes every non-empty object,
    # stripped and followed by a blank line, to the handle returned by
    # Create_file#file_meta. Without it: removes a previously written .meta file.
    def hard_output
      if @md.cmd =~/M/
        filename_meta=@my_make.file_meta
        @data.each {|s| filename_meta.puts s.strip + "\n\n" unless s.strip.empty?}
      else File.unlink(@hard) if FileTest.file?(@hard)
      end
    end
    # Dumps the data array with Marshal to the path given by Create_file#marshal_meta.
    def marshal
      marshal_meta=@my_make.marshal_meta
      File.open(marshal_meta,'w'){|f| Marshal.dump(@data.to_a,f)}
    end
  end
  # Builds the document abstraction from the marked up source paragraphs; the
  # entry point is #song, all other methods are protected pipeline steps.
  class Make
    @@endnote={}
    @@endnote_array=@@word_mode=[]
    @@endnote_counter,@@endnote_counter_asterisk,@@endnote_counter_dag=1,1,1
    @@comment='%'
    @@dp=nil
    # md:   document parameters object
    # data: array of source text blocks
    def initialize(md,data)
      @md,@data=md,data
      @@word_mode=[]
      @env=SiSU_Env::Info_env.new(@md.fns)
      @skin=SiSU_Env::Info_skin.new(@md)
      @dp=@@dp ||=SiSU_Env::Info_env.new.digest.pattern     # digest pattern is looked up once and shared
      l=SiSU_Env::Standardise_language.new.file_to_language(@md.fns)
      @language=l[:l]
      @tr=SiSU_Translate::Source.new(@md,@language)
    end
    # Puts the class-level endnote store and counters back to their start values.
    def reset
      @@flag_vocab=0
      @@endnote={}
      @@endnote_array=@@word_mode=[]
      @@endnote_counter,@@endnote_counter_asterisk,@@endnote_counter_dag=1,1,1
      @@line_mode=''
    end
    # Runs the whole pipeline over @data and returns the resulting array of
    # objects (document objects followed by the generated metadata section).
    # When the command string contains m or M the result is also handed to
    # Output#hard_output and Output#marshal.
    def song
      reset
      data=@data
      @metafile="#{@env.path.dal}/#{@md.fns}.meta"
      my_make_source_file=SiSU_Env::Create_file.new(@md.cmd,@md.fns)
      data=data.join.split("\n\n")
      data=SiSU_document_structure::Code.new(@md,data).code
      data_new=[]
      data.each do |x|
        data_new << if x =~ /\n\n/m; x.split(/\n\n+/)
        else x
        end
      end
      data=data_new.flatten
      data=substitutions_and_insertions?(data)
      data=Syntax::Markup.new(@md,data).songsheet
      data=character_check(data)
      data=images(data)
      data=SiSU_document_structure::Tables.new(@md,data).tables
      data=numbering_song(data)                             #tr issue
      data=endnotes(data)
      data=object_digest(data)
      meta=metadata(data)
      outputdata=data + meta
      if @md.cmd =~/[mM]/
        SiSU_DAL::Output.new(@md,outputdata).hard_output
        SiSU_DAL::Output.new(@md,outputdata).marshal
      end
      reset
      outputdata
    end
    protected
    # Per-paragraph character clean up (comment lines, quote/dash normalisation,
    # \copy and \tm shortcuts) and collection of "binary" endnotes (paragraphs
    # containing ^~) into @@endnote_array. Returns the array of kept paragraphs,
    # each terminated by "\n".
    def character_check(data)
      begin
        require 'iconv'
      rescue LoadError
        # iconv is not part of the standard library from Ruby 2.0 on; nothing
        # that is still active in this method uses it (non_utf8 is commented out)
      end
      reset
      @tuned_file=[]
      endnote_no=1
      data.each do |para|
        para.strip!
        para.gsub!(/^([12])~\?\s+/,'\1~ ')                 #conditional header for incorporated document 2004w12
        para.gsub!(/^[{~}]\s*$/,'')
        para.gsub!(/^#{@@comment}.*/,'')                    #remove comment and divider #%
        para.gsub!(/<~#>|~#\s*/,'<~#>')
        para.gsub!(/-#\s*/,'<-#><~#>')
        #para.gsub!(/(#\{{3} arch-tag:|0\{{3}~cvs)\s+/, "0{{~rcs ")  #KEEP ... ENABLE WIDER USE OF REVISION CONTROL
        para.gsub!(/(~\{ )\s+/,'\1')
        # NOTE(review): in this copy the next two literals contain a bare newline
        # where a line-break tag was presumably written (the second statement is
        # a no-op as it stands); reproduced as found - restore from upstream
        para.gsub!(/ \/\//,"\n")                            #added 2004w29
        para.gsub!(/\n/,"\n")                               #needed by xml, xhtml etc.
        #para.gsub!(/\n\n/,"\n\n")                          #consider
        para.gsub!(/`/,"'")
        para.gsub!(/\342\200\231/,"'")                      #if para =~/’/ #Avoid #‘ ’ #“ ”
        para.gsub!(/\t/,' ')
        # NOTE(review): the character matched here reached this copy as U+FFFD,
        # the original byte is unknown
        para.gsub!(/�/,' ')                                 #watch, replace with char code
        para.gsub!(/[“”]/,'""')
        para.gsub!(/[­–—]/,'-')                             #— – chk
        para.gsub!(/·/,'*')
        para.gsub!(/\\copy(?:right)?\b/,'©')
        para.gsub!(/\\trademark\b|\\tm\b/,'®')
        #non_utf8(para)
        para=para + "\n"
        case para
        when /\^~/                                          # endnotes #% Note must do this first (earlier loop) and then enter gathered data into ~^\d+
          sub_para=para.dup
          # non-bang gsub: gsub! returns nil when nothing is replaced, which made
          # the original chained call raise on a ^~ without following text
          @@endnote_array << sub_para.gsub(/\n/,'').gsub(/\^~\s+(.+)\s*/, %{~\{#{endnote_no} \\1 \}~}).strip
          endnote_no+=1
          para=nil if para =~/\^~ .+/                       #removes 'binary' endnote now in endnote array for later insertion
        end
        @tuned_file << para unless para.nil?
      end
      @tuned_file=@tuned_file.flatten
    end
    # For image markup without explicit dimensions ("{ name.png }url"), looks the
    # file up in the configured image directories and, when RMagick is enabled,
    # appends "WIDTHxHEIGHT" (scaled down to at most 640) after the image name;
    # a file that cannot be found is replaced by a "[ name (ext missing) ]" note.
    # Returns the array of paragraphs.
    def images(data)
      tuned_file=[]
      @rmgk=false
      if SiSU_Env::Info_settings.new.program?('rmagick'); @rmgk=SiSU_Env::Load.new('RMagick').prog
      else
        tell=SiSU_Screen::Ansi.new(@md.cmd,'use of RMagick is not enabled in sisurc.yml')
        tell.warn if @md.cmd =~/[vVM]/
      end
      data.each do |para|
        para.strip!
        if para =~/\{\s*\S+\.(?:png|jpg|gif)(?:\s*|\s+.+)?\}(?:(?:https?|ftp):\S+|image)/
          if para !~/\{\s*\S+\.(?:png|jpg|gif)\s+\d+x\d+\s+/
            m=/\{\s*(\S+\.(?:png|jpg|gif))/
            if @rmgk
              imgs=para.scan(m).flatten
              imgs.each do |image|
                dir=SiSU_Env::Info_env.new(@md.fns)
                path_image=[dir.path.image_source_local_tex,dir.path.image_source_remote_tex,dir.path.image_source_tex]
                # first directory that holds the image; the original reused an outer
                # local as block parameter, which since Ruby 1.9 no longer assigns
                # to the outer variable, so the image was never found
                image_path=path_image.find{|path| FileTest.exist?("#{path}/#{image}") }
                if image_path
                  img=Magick::ImageList.new("#{image_path}/#{image}")
                  img_col,img_row=img.columns,img.rows
                  if img_col > img_row                      #landscape
                    if img_col> 640                         #480
                      img_col=640                           #480
                      img_row=((1.00*img_col/img.columns)*img.rows).round
                    end
                  else                                      #portrait
                    if img_col> 640                         #480
                      img_col=640                           #480
                      img_row=((1.00*img_col/img.columns)*img.rows).round
                    end
                    if img_row > 640
                      img_row=640
                      img_col=((1.00*img_row/img.rows)*img.columns).round
                    end
                  end
                  # escaped, so that "." and other regex characters in a file
                  # name are matched literally
                  para.gsub!(/#{Regexp.escape(image)}/,"#{image} #{img_col}x#{img_row}")
                else
                  para.gsub!(/\{\s*(\S+)\.(png|jpg|gif).+?\}((?:https?|ftp):\S+|image)/,'[ \1 (\2 missing) ]')
                end
              end
            else
              para.scan(m) do |image|
                tell=SiSU_Screen::Ansi.new(@md.cmd,'where image dimensions have not been provided RMagick is required',image)
                tell.warn                                   #unless @opt.cmd =~/q/
              end
            end
          end
        end
        para.gsub!(/\{\s+(\S+\.(?:png|jpg|gif))\s+/i,'{\1 ') if para =~/\{\s+\S+\.(?:png|jpg|gif).+?\}(?:(?:https?|ftp):\S+|image)/
        tuned_file << para unless para.nil?
      end
      tuned_file
    end
    # make list of file types in shortcut command (as configured), e.g. when sisu -3 is used
    # cmd_shortcut: shortcut string from the markup, e.g. "3sS"
    # source:       name of the markup source file, listed when the shortcut contains s
    # Returns a flat array of output file names; it holds a nil for every file
    # type that is not selected (the caller in this class skips nil entries).
    def output_filetypes_in_cmd(cmd_shortcut,source=nil)
      cf_defaults=SiSU_Env::Info_processing_flag.new
      cmd_list=case cmd_shortcut.to_s
      when /0/; cf_defaults.cf_0
      when /1/; cf_defaults.cf_1
      when /2/; cf_defaults.cf_2
      when /3/; cf_defaults.cf_3
      when /4/; cf_defaults.cf_4
      when /5/; cf_defaults.cf_5
      end
      file_type_names=[]
      file_type_names <<= if cmd_list =~ /y/; 'sisu_manifest.html' end
      file_type_names <<= if cmd_list =~ /h/; ['toc.html', 'doc.html'] end
      file_type_names <<= if cmd_list =~ /p/; ['landscape.pdf', 'portrait.pdf'] end
      file_type_names <<= if cmd_list =~ /o/; 'opendocument.odt' end
      file_type_names <<= if cmd_list =~ /b/; 'scroll.xhtml' end
      file_type_names <<= if cmd_list =~ /x/; 'sax.xml' end
      file_type_names <<= if cmd_list =~ /X/; 'dom.xml' end
      file_type_names <<= if cmd_list =~ /a/; 'plain.txt' end
      file_type_names <<= if cmd_list =~ /g/; 'wiki.txt' end
      file_type_names <<= if cmd_list =~ /w/; 'concordance.html' end
      file_type_names <<= if cmd_list =~ /N/; 'digest.txt' end
      file_type_names <<= if source and cmd_shortcut =~ /s/; source end
      file_type_names <<= if cmd_shortcut =~ /S/; 'sisupod.zip' end
      file_type_names=file_type_names.flatten
    end
    # First pass over the source paragraphs: comments out a leading shebang /
    # SiSU identifier line, turns "@header:" lines into "0~header", expands the
    # document-link shortcut ("{ text [3sS]}file.sst") into a manifest link plus
    # one link per configured output type, and expands <:insertN> place holders.
    # Returns a flat array of paragraphs without nil entries.
    def substitutions_and_insertions?(data)
      tuned_file=[]
      if data[0] =~ /^#!\s*(?:\/usr\/bin\/env sisu|\/usr\/bin\/sisu)/ # remove bang from top #! (however file is stripped, so will be removed provided no content preceeds it)
        data[0].gsub!(/^#!\s*\/usr\/bin\/sisu/,'')
        data[0].gsub!(/^#!\s*\/usr\/bin\/env sisu/,'')
      end
      if data[0] =~ /^(SiSU\s+[\d.]*|sisu-[\d.]+)$/          # SiSU identifier
        data[0].gsub!(/^(SiSU\s*[\d.]*)$/,'% \1')
        data[0].gsub!(/^(sisu-[\d.]+)$/,'% \1')
      end
      data.each do |para|
        para=if @md.markup_version.to_f >= 0.38
          SiSU_document_structure::Structure.new(@md,para).structure_markup_normalize
        else para
        end
        #para.gsub!(//,'\1')                                #consider, would permit use of text hyperlinks if desired, dal_syntax more appropriate?
        para.gsub!(/^((?:[1-9]|:?[A-C])~\S*)\s*$/,'\1~ [Note: heading marker::required title missing]~#') #conditional header for incorporated document 2004w12
        if para =~/^@\S+?:/
          para.gsub!(/^@(\S+?):\s+/,'0~\1 ')
          para.gsub!(/^@(\S+?):([+-])\s+/,'0~\1\2 ')
        end
        if para !~/^%+\s/ and para =~/^(?:_\*\s+)?\{(?:~\^\s+)?(.+?)\s\[(?:\d(?:[sS]+))\]\}(?:\.\.\/\S+?\/|\S+?\.(?:sst|ssm)\b)(?:\s+~\{.+?\}~)?(?:\s+\*~\S+)*\s*$/
          txt,cmd,source,url_dir,note,manifest=nil,nil,nil,nil,nil,nil
          url_and_stub=SiSU_Env::Info_env.new.url
          if defined? url_and_stub.remote
            @output_url="#{url_and_stub.remote}"
            if para =~/\{(.+?)\s\[(\d[sS]*)\]\}((\S+?)\.ss[tm])(\s+~\{.+?\}~)?/ #syntax e.g.: { "Sphinx or Robot", Leena Krohn [3sS]}sphinx_or_robot.leena_krohn.1996.sst
              txt,cmd,source,url_dir,note=$1,$2,$3,$4,$5
            elsif para =~/\{(.+?)\s\[(\d[sS]*)\]\}\.\.\/(\S+?)\/(\s+~\{.+?\}~)?/ #syntax e.g.: { "Sphinx or Robot", Leena Krohn [3sS]}../sphinx_or_robot.leena_krohn.1996/
              txt,cmd,url_dir,note=$1,$2,$3,$4
            end
            manifest="{#{txt} }#@output_url/#{url_dir}/toc.html#{note}\n\n"
          else
            puts "error, does currently support relative paths (reltive paths were removed, as had problems for citation, and was not suited to all output types should possibly reconsider) #{__FILE__} #{__LINE__}"
            if para =~/\{(?:~\^\s+)?(.+?)\s\[(\d[sS]*)\]\}\.\.\/(\S+?)\/(\s+~\{.+?\}~)?/
              txt,cmd,url_dir,note=$1,$2,$3,$4
              manifest="{ #{txt} }../#{url_dir}/toc.html#{note}\n\n"
            end
          end
          tuned_file << manifest
          output_filetypes_in_cmd(cmd,source).each do |o_f|
            describe = case o_f
            when /sisu_manifest.html/; '~^ document manifest'
            when /toc.html/;           ' html, segmented text'
            when /doc.html/;           ' html, scroll, document in one'
            when /landscape.pdf/;      ' pdf, landscape'
            when /portrait.pdf/;       ' pdf, portrait'
            when /opendocument.odt/;   ' open document'
            when /scroll.xhtml/;       ' xhtml scroll'
            when /sax.xml/;            ' xml, sax'
            when /dom.xml/;            ' xml, dom'
            when /plain.txt/;          ' plain text utf-8'
            when /wiki.txt/;           ' wiki text'
            when /concordance.html/;   ' concordance'
            when /digest.txt/;         ' dcc, document content certificate (digests)'
            when /#{source}/;          ' markup source text'
            when /sisupod.zip/;        ' zipped markup source pod'
            else nil
            end
            if describe
              if @output_url
                tuned_file << "_1 {#{describe} }#@output_url/#{url_dir}/#{o_f}\n\n" if describe
              else tuned_file << "_1 { #{describe} }../#{url_dir}/#{o_f}\n\n"
              end
            end
          end
        elsif para =~/<:insert\d+!?>/ and para !~/^%\s+/
          @skin.select
          ins=SiSU_Viz::Inserts.new
          case para
          when /^\s*<:insert1>\s*$/
            para=[]
            ins.insert1.split(/\n\n/).each{|x| para << x }
          when /^\s*<:insert2>\s*$/
            para=[]
            ins.insert2.split(/\n\n/).each{|x| para << x }
          when /^\s*<:insert3>\s*$/
            para=[]
            ins.insert3.split(/\n\n/).each{|x| para << x << "\n"}
            para=ins.insert3
          when /^\s*<:insert4>\s*$/
            para=[]
            ins.insert4.split(/\n\n/).each{|x| para << x << "\n"}
            para=ins.insert4
          when /^\s*<:insert5>\s*$/
            para=[]
            ins.insert5.split(/\n\n/).each{|x| para << x << "\n"}
          when /^\s*<:insert6>\s*$/
            para=[]
            ins.insert6.split(/\n\n/).each{|x| para << x << "\n"}
          when /^\s*<:insert7>\s*$/
            para=[]
            ins.insert7.split(/\n\n/).each{|x| para << x << "\n"}
          end
          # para is a String here for insert3/insert4 and whenever no branch above
          # matched; String#each (iteration by line) was removed in Ruby 1.9, so
          # each_line is used to keep the line by line behaviour
          if para.is_a?(String); para.each_line{|x| tuned_file << x }
          else para.each{|x| tuned_file << x }
          end
        else tuned_file << para
        end
      end
      # done once after the loop instead of after every paragraph: same result,
      # without re-walking the whole array each time
      tuned_file.flatten!
      tuned_file.compact!
      tuned_file
    end
    # Runs the numbering and naming steps in order and returns the resulting array.
    def numbering_song(data)
      data=number_plaintext_para(data)
      data=name_endnote_seg(data)                           #tr issue
      data=auto_number_heading_ie_title(data)               #tr issue
      data=ocn(data) unless @md.markup =~/not_to/
      data=minor_numbering(data)                            #unless @md.markup =~/not_to/
      data=name_para_seg_filename(data)
      data=set_heading_seg(data) unless @md.set_heading_seg
      data=set_heading_top(data) unless @md.set_heading_top
      data=set_header_title(data) unless @md.set_header_title
      data
    end
    # Joins the lines within each paragraph with spaces and frames the paragraph
    # with newlines (paragraphs containing "¡" do not get the leading newline).
    def number_plaintext_para(data)
      @tuned_file=[]
      data.each do |para|
        para.gsub!(/(^|[^<][^v][^>])\n/,'\1 ')              #messy, but idea is that tables should retain breaks
        para.gsub!(/^/,"\n") unless para =~/¡/
        para.gsub!(/^\s+|\s$/,"\n")
        @tuned_file << para
      end
      @tuned_file=@tuned_file.flatten
    end
    # Replaces the "<:N> <:ee>" markers (N = 3, 2, 1) by the endnote segment text
    # held in @@endnote, then appends a "4~endnotes" heading when both
    # md.flag_auto_endnotes and md.flag_separate_endnotes_make are set.
    def name_endnote_seg(data)
      @tuned_file=[]
      data.each do |para|
        # NOTE(review): the replacement text is reproduced as found in this copy
        # (bare newlines where markup was presumably written) - restore from upstream
        [3,2,1].each do |lv|
          para.gsub!(/<:#{lv}>\s*<:ee>/,
            "#{@@endnote['special_align']}\n\n\n\r #{@@endnote["seg_name_#{lv}"]}\n\n#{@@endnote['special_align_close']}\n")
        end
        @tuned_file << para
      end
      # debug 2003w46 adding revision control info
      if @md.flag_auto_endnotes and @md.flag_separate_endnotes_make
        @tuned_file << "\n4~endnotes Endnotes <~0;0:0;u0>"  #prob numbering, revisit
      end
      @tuned_file << "\n"
      @tuned_file=@tuned_file.flatten
    end
    # NOTE(review): `data` is neither a parameter nor a local here, so calling
    # this raises NameError; no caller is visible in this file - confirm whether
    # @data was meant or whether the method can go.
    def owner_details_seg
      data << '4~owner.details Owner Details'
    end
    # Writes the auto-number title_no into a heading paragraph of level num and,
    # when md.toc_lev_limit is below num, turns level 5-8 markers into "!_ ".
    # Returns para.
    def number_sub_heading(para,num,title_no)
      case para
      when /#{num}~- /; para.gsub!(/#{num}~- /,"#{title_no} ")
      when /^#{num}~#\s*/; para.gsub!(/^#{num}~#\s*/,"#{title_no} ")
      when /^#{num}~[a-z_\.]+ /
        para.gsub!(/^#{num}~([a-z_\.]+)\s+(.+)/i,%{#{num}~\\1 #{title_no} \\2 <:name##{title_no}>})
      else para.gsub!(/^#{num}~ /,"#{num}~#{title_no} #{title_no} ") #main
      end
      if @md.toc_lev_limit and @md.toc_lev_limit < num
        para.gsub!(/^[5-8]~(?:~\S+)?\s*/,'!_ ')
      end
      para
    end
    # Auto-numbers headings from level num_top downwards (num_top taken from the
    # markup header or md.num_top) #also does some segment naming
    def auto_number_heading_ie_title(data)
      @tuned_file=[]
      if @md.markup =~/num_top/ or @md.num_top              # watch, 2003w23
        input="#{@md.markup}"[/num_top\=([1-6])/,1] if @md.markup
        input||=@md.num_top if @md.num_top !~/^$/
      end
      num_top=input.to_i
      t_no1=t_no2=t_no3=t_no4=0
      no1=num_top; no2=(num_top + 1); no3=(num_top + 2); no4=(num_top + 3)
      t_not=0
      data.each do |para|                                   #@md.seg_names << [additions to segment names]
        if (@md.markup =~/num_top/ or (@md.num_top and @md.num_top !~/^$/)) and para !~/^0~/
          # NOTE(review): there is no "|" between no3 and no4 in the pattern below,
          # so levels no3 and no4 are not matched individually; left as found
          # because the later branches depend on which headings were renamed here
          if (para =~/^(?:#{no1}|^#{no2}|^#{no3}#{no4})~#/ and para !~/^4~endnotes?/)
            t_not+=1                                        #; t_no2=0; t_no3=0
            para.gsub!(/^(#{no1})~#\s*/,"\\1~ps#{t_not} ")
            para.gsub!(/^(#{no2})~#\s*/,"\\1~ps#{t_not} ")
            para.gsub!(/^(#{no3})~#\s*/,"\\1~ps#{t_not} ")
            para.gsub!(/^(#{no4})~#\s*/,"\\1~ps#{t_not} ")
          end
          if para =~/#{no1}~/
            @subnumber=1
            @subnumber=0 if para =~/#{no1}~/
          end
          if para =~/^[0-6]~[ \w-]/ and para !~ /(?:[0-6]~[\w-]+-|4~endnotes|^[0-6]~([a-z_\.]+)\s+[\d.]+)\s/ and para !~/<~#>|<-#>/
            if para =~/^#{no1}~/
              t_no1+=1; t_no2=0; t_no3=0
              title_no="#{t_no1}"
              if not @md.seg_names.nil? and not @md.seg_names.include?(title_no)
                para.gsub!(/^#{no1}~\s+(\S+)#/,"#{no1}~#{title_no} \\1 #{title_no} ") #shift placement of auto-number to after first word, e.g. Article # not # Article, added on occasion of ABF (20040329)
                para.gsub!(/^#{no1}\{\s+(Article|Clause|Section)\s+#/i,%{#{no1}~#{title_no} \\1 #{title_no}. })
                unless para =~/^#{no1}~\s+[\d.]+\s/         #fix -> if the title starts with a numbering scheme, do not auto-number, review
                  para.gsub!(/^#{no1}~\s+/,"#{no1}~#{title_no} #{title_no}. ")
                end
                @md.seg_names << title_no
              #else puts "warning segment name #{title_no} already exists"
              end
              unless para =~/^#{no1}~([a-z_\.]+)\s+[A-Z]\.?\s/ #bug -> tmp fix, excludes A. B. C. lettering, but not roman numerals, is arbitrary, review required
                para.gsub!(/^#{no1}~([a-z_\.]+)\s+(.+)/i,%{#{no1}~\\1 #{title_no}. \\2 <:name##{title_no}>})
              end
              para.gsub!(/^#{no1}~#\s*/,"#{title_no}. ")
            end
            if para =~/^#{no2}~/
              t_no2+=1; t_no3=0
              title_no="#{t_no1}.#{t_no2}"
              para=number_sub_heading(para,no2,title_no)
            end
            if para =~/^#{no3}~/
              t_no3+=1
              title_no="#{t_no1}.#{t_no2}.#{t_no3}"
              para=number_sub_heading(para,no3,title_no)
            end
          elsif para =~ /^[0-6]~[\w-]+-/                    # endnotes, watch2005
            para.gsub!(/^#{no1}~([a-z_\.]+)- /,"#{no1}~\\1 ")
            para.gsub!(/^#{no2}~([a-z_\.]+)- /,"#{no2}~\\1 ")
            para.gsub!(/^#{no3}~([a-z_\.]+)- /,"#{no3}~\\1 ")
          end
        elsif @md.markup =~/num_extract/                    #AS DANGEROUS force enable with document, note already does this type of numbering for cisg, locate and coordinate logic, is currently misplaced in code, chengwei inspired 2004w23/4
          unless para =~ /^[0-6]~\S+/                       #endnotes watch?
            if para =~/^[1-6]~\s+([\d\.]+)/                 #risky (must be unique) consider output to 4~~\d instead of 4~\d
              name_num=$1
              para.gsub!(/^([1-6]~)\s+/,"\\1#{name_num} ")
            end
          end
          if @md.toc_lev_limit
          end
        end
        @tuned_file << para
      end
      @tuned_file=@tuned_file.flatten
    end
    # Appends the object citation number tag "<~ocn;lv;type>" to every object for
    # which SiSU_document_structure::OCN supplies a number #and auto segment numbering increment
    def ocn(data)
      @tuned_file=[]
      object_array=SiSU_document_structure::OCN.new(@md,data).ocn
      object_array.each do |o|
        @tuned_file <<= if o.ocn; "#{o.txt} <~#{o.ocn};#{o.lv};#{o.type}>"
        else o.txt
        end
      end
      @tuned_file=@tuned_file.flatten
    end
    # Numbers "#" list items (1. 2. ...) and letters "_#" sub-items (a. b. ...);
    # both counters restart at every heading #and auto segment numbering increment
    def minor_numbering(data)
      @tuned_file=[]
      number_small,letter_small=0,0
      letter=%w( a b c d e f g h i j k l m n o p q r s t u v w x y z )
      data.each do |para|
        if para =~/\w|\S|<|\(/
          # NOTE(review): several tag-like alternatives are missing from this
          # pattern in this copy; what they left behind were empty alternatives
          # ("||||" and a trailing "|"), which match every paragraph and switched
          # the whole numbering off, so the empty ones are dropped here - restore
          # the lost alternatives from upstream
          if para !~/^%% |^0~|^4~endnotes|^<\/center>|<:ee>|<:e[:_]>|^\^~ |<:e[:_]\d+?>|^<:p[bn]>|^<:\#|<:- |<[:!]!4|^(?:alt|code|group|poem|table)\{|^\}(?:alt|code|group|poem|table)|^\}table$|<\/tr>|\n\n\n|\[endnotes\]|<:zz>|<:isbn-|<:journal-|<:conference-/i #ocn here # added with Tune.code #¡
            if para=~/^[1-8]~/; number_small,letter_small=0,0 #% sub-number system, (baby numbering) reset with any change of major number (more obviously should be placed in number titles, but that is conditionally executed, check and move later)
            end
            if para =~/^#[ 1]/
              letter_small=0
              number_small=0 if para =~ /^#1/
              number_small+=1
              para.gsub!(/^#[ 1]/,"#{number_small}. ")      #change 2004
            end
            if para =~/^_# /
              para.gsub!(/^_# /,"<:i1> #{letter[letter_small]}. ") #change 2004
              letter_small+=1
            end
          end
        end
        @tuned_file << para
      end
      @tuned_file=@tuned_file.flatten
    end
    # paragraph name/numbering rules
    # manual naming overrides, manual naming may be
    #   alpha-numeric characters mixed,
    #   numeric only (a number), if
    #     all segments have been named,
    #     the numbers used are over 1000 or
    #     it is not minded that auto-numbering uses a funny scheme for naming segments (not yet implemented)
    #     [for now a warning is printed for such documents on use of maintenance or very-verbose flag]
    # auto-naming takes the form of giving numbers to segments
    # the rules for which are as follows
    #   if the title/heading text starts with a numeric, then that is used (1 3.1 3rd etc.)
    #   otherwise the level 4 segment number from the embedded document structure info is used
    #   if there is none a sequential number is designated, preceded by an underscore
    # Also prefixes headings with <:pn> / <:pb> for the levels listed in
    # md.pagenew / md.pagebreak. Returns the flattened array of paragraphs.
    def name_para_seg_filename(data)
      @tuned_file=[]
      art_filename_auto=1
      @counter=1
      @unique_auto_name=[]
      puts 'manual segment names, numbers used as names, risk warning (segmented html)' if not @md.seg_autoname_safe and @md.cmd =~/[MV]/
      data.each do |para|
        para=SiSU_document_structure::Structure.new(@md,para).structure_markup
        if para !~/^0~/
          if para =~/^[456]~ /
            if para=~/^4/ and not @md.set_heading_seg
              @md.set_heading_seg=true
            end
            if para =~/^[456]~(?:\s\S+)?\s+([\d.,:-]+)/m    #heading starts with a recognised numeric or word followed by a recognised numerical construct, use that as name
              pattern=$1
              pattern.gsub!(/(?:[:,-]|\W)/,'.')
              pattern.gsub!(/\.$/,'')
              if not @md.seg_names.nil? and not @md.seg_names.include?(pattern)
                para.gsub!(/^([456])~\s*/,"\\1~#{pattern} ")
                @md.seg_names << pattern
              else puts 'warn, there may be a conflicting numbering scheme' if @md.cmd =~/[VM]/
              end
            end
            if para =~/^4~\s.+?;4:(\d+);/m                  #extract segment name from embedded document structure info
              pattern=$1
              pattern.gsub!(/(?:[:,-]|\W)/,'.')
              pattern.gsub!(/\.$/,'')
              if not @md.seg_names.nil? and not @md.seg_names.include?(pattern)
                para.gsub!(/^(4)~\s*/,"\\1~#{pattern} ")
                @md.seg_names << pattern
              else
                para.gsub!(/^(4)~\s*/,"\\1~~#{pattern} ")
                @md.seg_names << "~#{pattern}"
              end
            end
            if para =~/^4~\s+/                              #if still not segment name, provide a numerical one
              if not @md.seg_names.nil? and not @md.seg_names.include?(art_filename_auto)
                para.gsub!(/^4~\s+/,%{4~_#{art_filename_auto} })
                @md.seg_names << art_filename_auto
              else puts 'segment name (numbering) error'
              end
              art_filename_auto+=1
            end
          end
        end
        @tuned_file << if para =~/^([1-6])~/m and (@md.pagenew or @md.pagebreak); m=$1 #watch ref~
          para_tmp=[]
          if @md.pagenew.to_s =~/#{m}/; para_tmp << "<:pn>\n" << para
          end
          if @md.pagebreak.to_s =~/#{m}/; para_tmp << "<:pb>\n" << para
          end
          para_result=unless para_tmp.length > 0; para
          else para_tmp
          end
        else para
        end
      end
      if @md.seg_names.length > 0
        @md.set_heading_seg=true
      end
      @tuned_file=@tuned_file.flatten
    end
    # Inserts a level 1 heading (md.title, or a placeholder) before the first
    # paragraph that is neither a header nor blank. Returns the new array, or nil
    # when md.set_heading_top is already set. #% make sure no false positives
    def set_heading_top(data)
      unless @md.set_heading_top
        puts "\tdocument contains no top level heading, (will have to manufacture one)" if @md.cmd =~/[MV]/
        @tuned_file=[]
        data.each do |para|
          unless @md.set_heading_top
            if para !~/^(?:@\S+:|0~\S+)\s/m and para !~/\A\s*\Z/m
              @md.set_heading_top=true
              head=if @md.title ; "1~ #{@md.title}"
              else '1~ [no title provided]'
              end
              @tuned_file << head
            end
          end
          @tuned_file << para
        end
        @tuned_file=@tuned_file.flatten
      end
    end
    # Inserts a "4~seg" segment heading before the first paragraph that is not a
    # header, a level 1-3 heading, blank or a page break marker. Returns the new
    # array, or nil when md.set_heading_seg is already set. #% make sure no false positives
    def set_heading_seg(data)
      unless @md.set_heading_seg
        puts "\tdocument contains no segment level, (will have to manufacture one)" if @md.cmd =~/[MV]/
        @tuned_file=[]
        data.each do |para|
          unless @md.set_heading_seg
            if para !~/^(?:@\S+:|0~\S+|[123]~)/m and para !~/\A\s*\Z/m and para !~/<:p[bn]>/
              @md.set_heading_seg=true
              head=if @md.title ; "4~seg [#{@md.title}]"
              else '4~seg [segment]'
              end
              @tuned_file << head
            end
          end
          @tuned_file << para
        end
        @tuned_file=@tuned_file.flatten
      end
    end
    # Inserts a "0~title" header built from md.heading_seg_first before the first
    # paragraph that is neither a comment nor blank, and records it as md.title.
    # Returns the new array, or nil when md.set_header_title is already set.
    # #% make sure no false positives
    def set_header_title(data)
      unless @md.set_header_title
        puts "\t no document title provided, (will have to manufacture one)" if @md.cmd =~/[MV]/
        @tuned_file=[]
        data.each do |para|
          unless @md.set_header_title
            if para !~/^%{1,2}\s/m and para !~/\A\s*\Z/m
              @tuned_file << "0~title #{@md.heading_seg_first}"
              @md.title=@md.heading_seg_first
              @md.set_header_title=true
            end
          end
          @tuned_file << para
        end
        @tuned_file=@tuned_file.flatten
      end
    end
    # Numbers the endnotes of each paragraph via #endnote_call_number and drops
    # asterisk / dagger annotations when md.mod carries the matching --no-* option.
    # Paragraphs that hold endnote markup are re-joined with single spaces.
    def endnotes(data)
      @tuned_file=[]
      #% endnote work zone
      data.each do |para|                                   # manually numbered endnotes <!e(\d)!> <!e_(\d)!> -->
        if @md.mod.inspect =~/--no-asterisk|--no-annotate/
          para.gsub!(/~\[[*]\s.+?\]~/,'')
        end
        if @md.mod.inspect =~/--no-dagger|--no-annotate/
          para.gsub!(/~\[[+]\s.+?\]~/,'')
        end
        case para                                           # auto-numbered endnotes <!e!> <!e_!> -->
        when /~\{\s+.+?\}~|~\[[*+]\s+.+?\]~/
          para.gsub!(/\s*(\}~|\]~)/,' \1')                  # required 2003w31
          word_mode=para.scan(/\S+/)
          word_mode=endnote_call_number(word_mode)
          para=word_mode.join(' ')
        when /~\^(?:\s|$)|<:e>/                             #%Note inserts endnotes previously gathered from /^(<!e[:_]!>|[-~]\{{3})/ (in earlier loop)
          word_mode=para.scan(/\S+/)
          word_mode=endnote_call_number(word_mode)
          para=word_mode.join(' ')
        end
        @tuned_file << para
      end
      @tuned_file=@tuned_file.flatten
    end
    # Takes the words of one paragraph and, in place, writes the running number
    # into each endnote opener ("~{" numbered, "~[+" dagger, "~[*" asterisk) or
    # substitutes the next collected endnote for a "~^" / "<:e>" call mark.
    # Returns the array it was given.
    def endnote_call_number(data)
      data.each do |word|
        case word
        when /~\{/
          unless word =~/~\{[*+]+/
            word.gsub!(/~\{/,"~\{#{@@endnote_counter} ")
            @@endnote_counter+=1
          end
        when /~\[/
          if word =~/~\[[+]/
            word.gsub!(/~\[[+]/,"~\[\+#{@@endnote_counter_dag} ")
            @@endnote_counter_dag+=1
          else
            word.gsub!(/~\[[*]?/,"~\[\*#{@@endnote_counter_asterisk} ")
            @@endnote_counter_asterisk+=1
          end
        when /~\^|<:e>/
          word.gsub!(/~\^|<:e>/,"#{@@endnote_array[@@endnote_counter-1]}")
          @@endnote_counter+=1
        end
      end
    end
    # Builds the "Document Information" section appended to the document: the
    # Dublin Core style headers found in data, optional cvs/rcs details and the
    # version information of this run. Returns the section as an array of
    # objects that have been passed through #object_digest.
    def metadata(data)
      meta,@dc,@rc,@cvs,dctitle,add=Array.new(6){[]}
      dir=SiSU_Env::Info_env.new(@md.fns)
      base_html="#{dir.url.root}/#{@md.fnb}"
      ocnm=ocnd=ocnv=0
      ocnm+=1
      header0='<:pn>'
      header1="\n1~ Document Information <~0;0:0;m#{ocnm}>"
      ocnm+=1
      header4="\n4~metadata MetaData <~0;m#{ocnm};m#{ocnm}>"
      ocnm+=1; ocnd+=1
      head_no_dc="<~0;m#{ocnm};d#{ocnd}>"
      ocnm+=1; ocnd+=1
      head_no_dc_tag="<~0;m#{ocnm};d#{ocnd}>"
      data.each do |para|
        case para
        # NOTE(review): only the headers listed in this pattern reach the inner
        # case, so its branches for subject, abstract, relation, coverage,
        # keywords, comments, cls_* and prefix_* are currently never taken
        when /^0~(title|creator|author|translator|translated_by|illustrator|illustrated_by|prepared_by|digitized_by|description|publisher|contributor|date\.created|date\.issued|date\.available|date\.valid|date\.modified|date|type|format|rights|identifier|source|language)/i
          m=$1
          ocnm+=1; ocnd+=1
          @dc << case para
          when /^0~title/
            "\n#{@tr.dc_title}: #{@md.dc_title} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~(?:creator|author)/
            "\n#{@tr.creator}: #{@md.dc_creator} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~(?:translator|translated_by)/            # anchored like its siblings
            "\n#{@tr.translator}: #{@md.translator} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~(?:illustrator|illustrated_by)/
            "\n#{@tr.illustrator}: #{@md.illustrator} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~prepared_by/
            "\n#{@tr.prepared_by}: #{@md.prepared_by} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~digitized_by/
            "\n#{@tr.digitized_by}: #{@md.digitized_by} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~description/
            "\n#{@tr.description}: #{@md.dc_description} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~subject/
            "\n#{@tr.subject}: #{@md.dc_subject} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~abstract/
            "\n#{@tr.abstract}: #{@md.dc_abstract} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~publisher/
            "\n#{@tr.publisher}: #{@md.dc_publisher} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~contributor/
            "\n#{@tr.contributor}: #{@md.dc_contributor} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~date.created/
            "\n#{@tr.date_created}: #{@md.dc_date_created} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~date.issued/
            "\n#{@tr.date_issued}: #{@md.dc_date_issued} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~date.available/
            "\n#{@tr.date_available}: #{@md.dc_date_available} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~date.modified/
            "\n#{@tr.date_modified}: #{@md.dc_date_modified} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~date.valid/
            "\n#{@tr.date_valid}: #{@md.dc_date_valid} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~date/
            "\n#{@tr.date}: #{@md.dc_date} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~type/
            "\n#{@tr.type}: #{@md.dc_type} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~format/
            "\n#{@tr.format}: #{@md.dc_format} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~rights/
            "\n#{@tr.rights}: #{@md.dc_rights} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~identifier/
            "\n#{@tr.identifier}: #{@md.dc_identifier} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~source/
            "\n#{@tr.source}: #{@md.dc_source} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~language.original/                       # tested before the plain language header, which would otherwise match first
            "\n#{@tr.language_original}: #{@md.language_original} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~language/
            "\n#{@tr.language}: #{@md.dc_language} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~relation/
            "\n#{@tr.relation}: #{@md.dc_relation} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~coverage/
            "\n#{@tr.coverage}: #{@md.dc_coverage} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~keywords/
            "\n#{@tr.keywords}: #{@md.keywords} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~comments/
            "\n#{@tr.comments}: #{@md.comments} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~cls_loc/
            # NOTE(review): label comes from @cls_dewey (never assigned in this
            # class) and the value from cls_dewey although the header is cls_loc
            "\n#{@cls_dewey}: #{@md.cls_dewey} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~cls_dewey/
            "\n#{@tr.cls_dewey}: #{@md.cls_dewey} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~cls_gutenberg|0~cls_pg/
            "\n#{@tr.cls_gutenberg}: #{@md.cls_gutenberg} <~0;m#{ocnm};d#{ocnd}>"
            #"\n#{@tr.cls_gutenberg}: #{@md.cls_pg} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~cls_isbn/
            "\n#{@tr.cls_isbn}: #{@md.cls_isbn} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~prefix(?:_a)?/
            "\n#{@tr.prefix_a}: #{@md.prefix_a} <~0;m#{ocnm};d#{ocnd}>"
          when /^0~prefix_b/
            "\n#{@tr.prefix_b}: #{@md.prefix_b} <~0;m#{ocnm};d#{ocnd}>"
          else para.gsub(/^0~(#{m})\s+(.+)/m,"\n#{m.capitalize}: \\2 <~0;m#{ocnm};d#{ocnd}>")
          end
        end
      end
      ocnm+=1; ocnv+=1
      head_no_rc="<~0;m#{ocnm};v#{ocnv}>"
      ocnm+=1; ocnv+=1
      head_no_rc_tag="<~0;m#{ocnm};v#{ocnv}>"
      data.each do |para|
        case para
        when /^0~(?:cvs|rcs)\+\s+/                          #note the + sign to turn on use of cvs id
          ocnm+=1; ocnv+=1
          @cvs << "#{@tr.sc_number}: #{@md.sc_number} <~0;m#{ocnm};v#{ocnv}>"
          ocnm+=1; ocnv+=1
          @cvs << "#{@tr.sc_date}: #{@md.sc_date} <~0;m#{ocnm};v#{ocnv}>"
          ocnm+=1; ocnv+=1
          @cvs << "CVS/RCS time: #{@md.sc_time} <~0;m#{ocnm};v#{ocnv}>"
          ocnm+=1; ocnv+=1
        when /^0~cvs[+\s]/                                  #enable pattern above instead if you wish the default to be to include cvs tags from all documents KEEP
        when /^0~cvs\s+/                                    #enable pattern above instead if you wish the default to be to include cvs tags from all documents KEEP
        end
      end
      if true                                               #default version information
        ocnm+=1; ocnv+=1
        if @md.sc_filename and @md.sc_filename.length > 3
          @rc << "#{@tr.sourcefile}: #{@md.sc_filename} <~0;m#{ocnm};v#{ocnv}>"
        else @rc << "#{@tr.sourcefile}: #{@md.fns} <~0;m#{ocnm};v#{ocnv}>"
        end
        ocnm+=1; ocnv+=1
        if @md.file_encoding and @md.file_encoding.length > 3 #translate
          @rc << "Filetype: #{@md.file_encoding} <~0;m#{ocnm};v#{ocnv}>"
        end
        ocnm+=1; ocnv+=1
        if @md.dgst                                         #change. enable by default
          @rc << "#{@tr.sourcefile_digest}, #{@md.dgst[0]} #{@md.dgst[1]} <~0;m#{ocnm};v#{ocnv}>"
          ocnm+=1; ocnv+=1
        end
        if @md.dgst_skin                                    #change. enable by default
          @rc << "Skin_Digest: #{@md.dgst_skin[0]} #{@md.dgst_skin[1]} <~0;m#{ocnm};v#{ocnv}>"
          ocnm+=1; ocnv+=1
        end
        @rc << "Generated #{head_no_rc}" if @rc.length > 0
        @rc << "#{@tr.last_generated}: #{Time.now} <~0;m#{ocnm};v#{ocnv}>"
        ocnm+=1; ocnv+=1
        if @md.sisu_version[:version]
          @rc << "#{@tr.sisu_version}: #{@md.sisu_version[:project]} #{@md.sisu_version[:version]} of #{@md.sisu_version[:date_stamp]} (#{@md.sisu_version[:date]}) <~0;m#{ocnm};v#{ocnv}>"
          ocnm+=1; ocnv+=1
        end
        @rc << "#{@tr.ruby_version}: #{@md.ruby_version} <~0;m#{ocnm};v#{ocnv}>"
      end
      meta << header0
      meta << header1
      meta << header4
      meta << "Document Manifest @\n #{base_html}/#{@md.fn[:manifest]} <~0;m#{ocnm};m#{ocnm}>"
      meta << "Dublin Core (DC) #{head_no_dc}" if @dc.length > 0
      meta << "DC tags included with this document are provided here. #{head_no_dc_tag}" if @dc.length > 0
      @dc.each { |x| meta << x }
      meta << "Version Information #{head_no_rc}" if @rc.length > 0
      if @cvs.length > 0
        meta << "Note the version information provided here, is specific to the host site. #{head_no_rc_tag}"
        @cvs.each { |x| meta << x }
      end
      @rc.each { |x| meta << x }
      ## ENDNOTE RELATED endnote related
      meta << "\n"
      meta=object_digest(meta)
    end
    # Returns para followed by "<digest of stripped text:digest of whole object>",
    # computed with hash_class (Digest::MD5 or Digest::SHA256). When para holds
    # numbered endnotes each endnote additionally gets its own digest through
    # #endnote_digest.
    def stamped(para,hash_class)
      @tuned=[]
      para=strip_clean_extra_spaces(para)
      digest_all=hash_class.hexdigest(para)                 # print "#{hash_class.name}: "; puts digest_all #length==32 or 64
      stripped=strip_clean_of_markup(para)
      digest_strip=hash_class.hexdigest(stripped)
      case para
      when /~\{[\d*+]+\s+.+?\}~|~\[[*+]\d+\s+.+?\]~/
        en_and_para,en_and_para_digest=[],[]
        para.gsub!(/\s*(\}~|\]~)/,' \1')                    #watch
        para_plus_en=para.scan(/.*?~\{.+?\}~|.*?~\[.+?\]~/)
        para_tail=if para =~/(?:.*?~\{.+?\}~|.*?~\[.+?\]~)+([\s\S]+)/
          /(?:.*?~\{.+?\}~|.*?~\[.+?\]~)+(.+?<~\d+;(?:\w|[0-6]:)\d+;\w\d+>)/.match(para)[1]
        else ''
        end
        para_plus_en << para_tail
        en_and_para_digest << endnote_digest(para_plus_en)
        para_new=en_and_para_digest.join(' ')
        @tuned << para_new + '<' + digest_strip + ':' + digest_all + '>' unless para.nil?
      else @tuned << para + '<' + digest_strip + ':' + digest_all + '>' unless para.nil?
      end
      @tuned.join
    end
    # 1. clean/stripped text without any markup, paragraph, headings etc. without endnotes
    # 2. endnotes clean/stripped text digest only (there may be several endnotes within a paragraph)
    # 3. whole object, text with markup and any endnotes, (question: with or without the endnote digests??? presumption better without, [however may be easier to check with?])
    # [digests should not include other digests]
    # vim==/<[0-9a-f]\{#{@@dl}\}\(:[0-9a-f]\{#{@@dl}\}\)\?>/
    # Every object that carries an ocn tag is passed through #stamped (SHA256 when
    # the configured digest type says so, MD5 otherwise); other objects are kept
    # unchanged. nil entries are removed from data itself.
    def object_digest(data)
      require 'digest/md5'
      require 'digest/sha2'
      @tuned_file=[]
      data.compact!
      data.each do |para|
        para.strip!
        if para=~/<~\d+;(?:\w|[0-6]:)\d+;\w\d+>/
          hash_class=if @env.digest.type =~/sha256/; Digest::SHA256
          else Digest::MD5
          end
          @tuned_file << stamped(para,hash_class)
        else @tuned_file << para unless para.nil?
        end
      end
      @tuned_file=@tuned_file.flatten                       #use md5 or to create hash of each dal object including ocn, & add into to each dal object
    end
    # data: the pieces of one paragraph, each ending in an endnote, plus the tail.
    # Inserts "<digest>" of the stripped endnote text before each endnote closer
    # and returns the pieces joined into one string.
    def endnote_digest(data)
      para_bit=[]
      data.each do |en_plus|
        para_bit <<= case en_plus
        when /~\{|~\[/
          if en_plus =~/~\{.+?\}~|~\[.+?\]~/
            para_txt,en_open,en_txt,en_close=/(.*?)(~\{|~\[)(.+?)(\}~|\]~)/m.match(en_plus)[1..4]
            stripped_en=strip_clean_of_markup(en_txt)
            if @env.digest.type =~/sha256/
              digest_en_strip=Digest::SHA256.hexdigest(stripped_en)
            else digest_en_strip=Digest::MD5.hexdigest(stripped_en)
            end
            para_txt + en_open + en_txt + '<' + digest_en_strip + '>' + en_close
          else puts "Error Exception - problem encountered with:\n#{en_plus}" #arbitrary exception, tidy up
          end
        else en_plus
        end
      end
      para_bit.join
    end
    # dal output tuned
    # Returns a copy of s with spaces before punctuation, runs of spaces and
    # leading/trailing space runs removed.
    def strip_clean_extra_spaces(s)
      s=s.dup
      s=s.gsub(/[ ]+([,.;:?](?:$|\s))/,'\1')
      s=s.gsub(/ [ ]+/,' ')
      s=s.gsub(/^ [ ]+/,'')
      s=s.gsub(/ [ ]+$/,'')
      s=s.gsub(/(<\/[bi]>')[ ]+(s )/,'\1\2')
    end
    # used for digest, define rules, make same as in db clean
    # Returns a copy of s without bold/italic tags, ocn tags, digests, heading
    # markers and numbered endnotes, with whitespace squeezed and trimmed.
    def strip_clean_of_markup(s)
      #consider: <\/?[ib]>|<(?:\/ )?br>|<del>(.+?)<\/del>
      s=s.dup
      s=s.gsub(/(?:<\/?[ib]>|<~\d+;(?:\w|[0-6]:)\d+;\w\d+>|<#@dp:#@dp>|^[1-6]~\S+|~\{\d+\s.+?\}~)/,'') # markup and endnotes removed
      #% same as db clean -->
      # NOTE(review): the opening <del> and <sup> tags were missing from the next
      # two patterns in this copy; restored on the strength of the closing tags
      # that survived - confirm against upstream
      s=s.gsub(/<del>(.+?)<\/del>/,'DELETED(\1)')           # deletions
      s=s.gsub(/<sup>(\d+)<\/sup>/,'[\1]')
      # NOTE(review): what precedes "\\;" in this pattern appears to have been
      # altered by the same extraction - confirm against upstream
      s=s.gsub(/(?: \\;)+/,' ')
      #s=s.gsub(//,"[TABLE]\n")                             # tables
      #s=s.gsub(//,'\1')                                    # tables
      #s=s.gsub(/¡¡\d+¡/,' ')                               # tables
      #s=s.gsub(/¡/,' ')                                    # tables tidy later
      #s=s.gsub(/<.+?>/,'')
      s=s.gsub(/\{.+?\.(?:png|jpg|gif).+?\}(?:https?|ftp)\\\:\S+ /,' [image] ') # else image names found in search
      s=s.gsub(/\s\s+/,' ')
      s=s.strip
    end
  end
end
__END__
dal output, rules to simplify parsing
nodes === objects === paragraphs === text blocks separated by \n\n
dal output:
  :verse :group and :code have -end
  :table is not used