# coding: utf-8
=begin
* Name: SiSU
* Description: a framework for document structuring, publishing and search
* Author: Ralph Amissah
* Copyright: (C) 1997 - 2010, Ralph Amissah, All Rights Reserved.
* License: GPL 3 or later:
SiSU, a framework for document structuring, publishing and search
Copyright (C) Ralph Amissah
This program is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program. If not, see .
If you have Internet connection, the latest version of the GPL should be
available at these locations:
* SiSU uses:
* Standard SiSU markup syntax,
* Standard SiSU meta-markup syntax, and the
* Standard SiSU object citation numbering and system
* Hompages:
* Download:
* Ralph Amissah
** Description: system environment, resource control and configuration details
=end
module SiSU_hash
class Object_digest
def initialize(md,data,env=nil)
@md,@data,@env=md,data,env
@env ||=SiSU_Env::Info_env.new(@md.fns)
end
def object_digest
# 1. clean/stripped text without any markup, paragraph, headings etc. without endnotes
# 2. endnotes clean/stripped text digest only (there may be several endnotes within a paragraph)
# 3. whole object, text with markup and any endnotes, (question: with or without the endnote digests??? presumption better without, [however may be easier to check with?])
# [digests should not include other digests]
# vim==/<[0-9a-f]\{#{@@dl}\}\(:[0-9a-f]\{#{@@dl}\}\)\?>/
require 'digest/md5'
require 'digest/sha2'
data=@data
@tuned_file=[]
data.compact!
data.each do |para|
para.strip!
if para=~/#{Mx[:id_o]}~\d+;(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}/ \
and para !~/#{Rx[:meta]}/ #test should not be necessary remove
if @env.digest.type =~/sha256/
for hash_class in [ Digest::SHA256 ]
@tuned_file << stamped(para,hash_class)
end
else
for hash_class in [ Digest::MD5 ]
@tuned_file << stamped(para,hash_class)
end
end
else @tuned_file << para unless para.nil?
end
end
@tuned_file=@tuned_file.flatten
#use md5 or to create hash of each dal object including ocn, & add into to each dal object
end
def endnote_digest(data)
para_bit=[]
data.each do |en_plus|
para_bit <<= case en_plus
when /#{Mx[:en_a_o]}|#{Mx[:en_b_o]}/
if en_plus =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}|#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/
para_txt,en_open,en_txt,en_close=/(.*?)(#{Mx[:en_a_o]}|#{Mx[:en_b_o]})(.+?)(#{Mx[:en_a_c]}|#{Mx[:en_b_c]})/m.match(en_plus)[1..4]
stripped_en=strip_clean_of_markup(en_txt)
digest_en_strip=if @env.digest.type =~/sha256/
Digest::SHA256.hexdigest(stripped_en)
else
Digest::MD5.hexdigest(stripped_en)
end
para_txt + en_open + en_txt + Mx[:id_o] + digest_en_strip + Mx[:id_c] + en_close
else puts "Error Exception - problem encountered with:\n#{en_plus}" #arbitrary exception, tidy up
end
else en_plus
end
end
para_bit.join
end
def stamped(para,hash_class)
@tuned=[]
para=strip_clean_extra_spaces(para)
digest_all=hash_class.hexdigest(para) # print "#{hash_class.name}: "; puts digest_all #length==32 or 64
stripped=strip_clean_of_markup(para)
digest_strip=hash_class.hexdigest(stripped)
unless para =~/#{Mx[:fa_o]}code#{Mx[:fa_c]}/
case para
when /#{Mx[:en_a_o]}[\d*+]+\s+.+?#{Mx[:en_a_c]}|#{Mx[:en_b_o]}[*+]\d+\s+.+?#{Mx[:en_b_c]}/m
en_and_para,en_and_para_digest=[],[]
para.gsub!(/\s*(#{Mx[:en_a_c]}|#{Mx[:en_b_c]})/m,' \1') #watch
para_plus_en=para.scan(/.*?#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}|.*?#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m)
para_tail=if para =~/(?:.*?#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}|.*?#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]})+([\s\S]+)/m
/(?:.*?#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}|.*?#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]})+(.*?#{Mx[:id_o]}~\d+;(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]})/m.match(para)[1]
#/(?:.*?#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}|.*?#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]})+(.+?#{Mx[:id_o]}~\d+;(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]})/m.match(para)[1]
else ''
end
para_plus_en << para_tail
en_and_para_digest << endnote_digest(para_plus_en)
para_new=en_and_para_digest.join(' ')
@tuned << para_new + Mx[:id_o] + digest_strip + ':' + digest_all + Mx[:id_c] unless para.nil?
else @tuned << para + Mx[:id_o] + digest_strip + ':' + digest_all + Mx[:id_c] unless para.nil?
end
else @tuned << para + Mx[:id_o] + digest_strip + ':' + digest_all + Mx[:id_c] unless para.nil?
end
@tuned.join
end
def strip_clean_extra_spaces(s) # dal output tuned
s=s.dup
s=s.gsub(/[ ]+([,.;:?](?:$|\s))/,'\1') unless s =~/#{Mx[:en_a_o]}|#{Mx[:en_b_o]}/
s=s.gsub(/ [ ]+/,' ')
s=s.gsub(/^ [ ]+/,'')
s=s.gsub(/ [ ]+$/,'')
s=s.gsub(/((?:#{Mx[:fa_bold_c]}|#{Mx[:fa_italics_c]})')[ ]+(s )/,'\1\2')
s=s.gsub(/((?:#{Mx[:fa_bold_c]}|#{Mx[:fa_italics_c]})')[ ]+(s )/,'\1\2')
end
def strip_clean_of_markup(s) # used for digest, define rules, make same as in db clean
#consider: <\/?[ib]>|<(?:\/ )?br>|(.+?)<\/del>
s=s.dup
s=s.gsub(/(?:<\/?[ib]>|#{Mx[:id_o]}~\d+;(?:\w|[0-6]:)\d+;\w\d+#{Mx[:id_c]}|#{Mx[:id_o]}#@dp:#@dp#{Mx[:id_c]}|^#{Mx[:lv_o]}[1-6]:\S+?#{Mx[:lv_c]}|#{Mx[:en_a_o]}\d+\s.+?#{Mx[:en_a_c]})/m,'') # markup and endnotes removed
#% same as db clean -->
s=s.gsub(/(.+?)<\/del>/,'DELETED(\1)') # deletions
s=s.gsub(/(\d+)<\/sup>/,'[\1]')
s=s.gsub(/(?:#{Mx[:nbsp]})+/,' ')
#s=s.gsub(//,"[TABLE]\n") # tables
#s=s.gsub(//,'\1') # tables
#s=s.gsub(/¡¡\d+¡/,' ') # tables
#s=s.gsub(/¡/,' ') # tables tidy later
#s=s.gsub(/<.+?>/,'')
s=s.gsub(/#{Mx[:lnk_o]}.+?\.(?:png|jpg|gif).+?#{Mx[:lnk_c]}(?:https?|file|ftp)\\\:\S+ /,' [image] ') # else image names found in search
s=s.gsub(/\s\s+/,' ')
s=s.strip
end
end
end
__END__