# encoding: utf-8
# Provenance: lib/sisu/v3dv/harvest_topics.rb as added by commit
# 75e3bf86382edf99275a25895b362647158e25c1 (Ralph Amissah, 2012-01-10),
# "v3dv, add dev branch (use to make some changes to module & class names &
# test)": (intended as) short term branch, merge back into v3 once tested;
# invoked with `sisu --dev`.
#
# NOTE(review): this file was recovered from a whitespace-collapsed patch view
# in which all angle-bracket markup had been stripped. URLs in the header
# below and the HTML inside string literals/heredocs are reconstructions
# built from the variables each method computes - confirm against upstream
# before relying on exact markup.
=begin

 * Name: SiSU

 * Description: a framework for document structuring, publishing and search
   metadata harvest, extract topics and associated writings from document set
   (topics use topic_register header)

 * Author: Ralph Amissah

 * Copyright: (C) 1997 - 2012, Ralph Amissah, All Rights Reserved.

 * License: GPL 3 or later:

   SiSU, a framework for document structuring, publishing and search

   Copyright (C) Ralph Amissah

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the Free
   Software Foundation, either version 3 of the License, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
   more details.

   You should have received a copy of the GNU General Public License along with
   this program. If not, see <http://www.gnu.org/licenses/>.

   If you have Internet connection, the latest version of the GPL should be
   available at these locations:
   <http://www.fsf.org/licensing/licenses/gpl.html>
   <http://www.gnu.org/licenses/gpl.html>

 * SiSU uses:
   * Standard SiSU markup syntax,
   * Standard SiSU meta-markup syntax, and the
   * Standard SiSU object citation numbering and system

 * Homepages:
   <http://www.jus.uio.no/sisu>
   <http://www.sisudoc.org>

 * Download:
   <http://www.sisudoc.org/sisu/en/SiSU/download.html>

 * Ralph Amissah

 ** Description: simple xml representation (sax style)

=end
module HARVEST_topics
  require 'fileutils'                                      # FileUtils.mkdir_p
  require_relative 'author_format'                         # author_format.rb
  include SiSU_Viz
  # Entry point: reads the header of every source file named in opt.f_pths,
  # harvests title/creator/topic_register, builds the topic tree and writes
  # the html topic index.
  class Songsheet
    # Accumulated topic tree, shared by every Songsheet instance.
    @@the_idx_topics={}
    # opt:: command line options object (reads #files, #f_pths)
    # env:: environment object, passed through to Harvest and Index
    def initialize(opt,env)
      @opt,@env=opt,env
      @file_list=opt.files
    end
    # Harvest all files, build the index and print it as html.
    def songsheet
      puts 'topics:'
      idx_array={}
      @opt.f_pths.each do |y|
        lang=y[:lng_is]
        name=y[:f]
        filename=y[:pth] + '/' + y[:f]
        paras=header_paragraphs(filename)
        next if paras.empty?
        idx_array[lang] ||= []
        idx_array=HARVEST_topics::Harvest.new(@opt,@env,paras,filename,name,idx_array,lang).extract_harvest
      end
      the_idx=HARVEST_topics::Index.new(@opt,@env,idx_array,@@the_idx_topics).construct_book_topic_index
      HARVEST_topics::Output_index.new(@opt,the_idx).html_print.html_songsheet
    end
    private
    # Returns the blank-line separated header paragraphs of +filename+ that
    # start a @title:, @creator: or @classify: header. Reading stops at the
    # first paragraph that is neither a header, blank, nor a %-comment.
    def header_paragraphs(filename)
      paras=[]
      File.open(filename,'r') do |file|
        file.each_line("\n\n") do |para|
          if para =~/^@(?:title|creator|classify):(?:\s|$)/m
            paras << para
          elsif para =~/^@\S+?:(?:\s|$)/m \
          or para =~/^(?:\s*\n|%+ )/
            next                                           # other header, blank or comment: keep reading
          else break                                       # document body reached
          end
        end
      end
      paras
    end
  end
  # Extracts title, subtitle, author and topic register from the header
  # paragraphs of one file and appends one entry per topic path to
  # idx_array[lang].
  class Harvest
    RGX={
      author:   /^@creator:(?:[ ]+|.+?:author:[ ]+)(.+?)(?:\||\n)/m,
      title:    /^@title:[ ]+(.+)/,
      subtitle: /^@title:.+?:subtitle:[ ]+(.+?)\n/m,
      idx:      /^@classify:.+?:topic_register:[ ]+(.+?)\n/m,
    }.freeze
    def initialize(opt,env,data,filename,name,idx_array,lang)
      @opt,@env,@data,@filename,@name,@idx_array,@lang=opt,env,data,filename,name,idx_array,lang
    end
    # Returns idx_array, with idx_array[lang] extended by hashes of the form
    # { filename:, file:, rough_idx:, title:, author:, page:, lang: }.
    def extract_harvest
      filename,name,idx_array,lang=@filename,@name,@idx_array,@lang
      @idx_list,@title,@subtitle,@fulltitle,@author_format=nil,nil,nil,nil,nil
      @data.each do |para|
        if (m=RGX[:idx].match(para))      then @idx_list=m[1]      end
        if (m=RGX[:title].match(para))    then @title=m[1]         end
        if (m=RGX[:subtitle].match(para)) then @subtitle=m[1]      end
        if (m=RGX[:author].match(para))   then @author_format=m[1] end
        # the original tested @author/@idx_lst, which are never set, so the
        # early exit could not fire
        break if @title and @subtitle and @author_format and @idx_list
      end
      idx_array[lang] ||= []
      if @title \
      and @author_format \
      and @idx_list
        @fulltitle=@subtitle ? (@title + ' - ' + @subtitle) : @title
        creator=FORMAT::Author.new(@author_format.strip).author_details
        @authors,@authorship=creator[:authors],creator[:authorship]
        file=name.sub(/(?:~[a-z]{2,3})?\.ss[mt]$/,'')       # drop extension and any ~lang suffix
        page=if @env.output_dir_structure.by_language_code?
          "#{lang}/sisu_manifest.html"
        else
          "sisu_manifest.#{lang}.html"
        end
        # ';' separates independent topic paths
        @idx_list.scan(/[^;]+/).each do |topic|
          idx_array[lang] << { filename: filename, file: file, rough_idx: topic.strip, title: @fulltitle, author: creator, page: page, lang: lang }
        end
      else
        p "missing required field in #{@filename} - [title]: <<#{@title}>>; [author]: <<#{@author_format}>>; [idx]: <<#{@idx_list}>>" if @opt.cmd.inspect =~/[VM]/
      end
      idx_array[lang].flatten!
      idx_array
    end
  end
  # Builds the nested topic tree: the_idx[lang][topic][subtopic]...[:md] is
  # the list of documents filed under that topic path.
  class Index
    MAX_LEVELS=5                                           # topic paths are followed five levels deep, as before
    def initialize(opt,env,idx_array,the_idx)
      @opt,@env,@idx_array,@the_idx=opt,env,idx_array,the_idx
    end
    # Upper-cases the first character of txt, leaving the rest untouched.
    def capital(txt)
      return txt if txt.empty?
      txt[0].capitalize + txt[1,txt.length]
    end
    # Appends the document described by idx to the md list +hash+; author
    # names are rendered as links into the authors harvest page.
    def contents(lang,hash,idx)
      authors_page=@env.output_dir_structure.by_language_code? \
      ? 'authors.html'
      : "authors.#{lang}.html"
      names=''
      idx[:author][:last_first_format_a].each do |n|
        s=n.sub(/(.+?)(?:,.+|$)/,'\1').gsub(/\s+/,'_')     # anchor: text before the first comma
        names += %{<a href="#{authors_page}##{s}">#{n}</a>, }
      end
      hash << { filename: idx[:filename], file: idx[:file], author: names, title: idx[:title], page: idx[:page] }
    end
    # Files every harvested entry into the tree and returns the tree.
    # In rough_idx ':' separates levels and '|' alternatives within a level.
    def construct_book_topic_index
      @idx_array.each_pair do |lang,entries|
        @the_idx[lang] ||= {}
        entries.each do |idx|
          unless idx[:rough_idx]
            puts "no topic register in: << #{idx[:filename]} >>"
            next
          end
          levels=idx[:rough_idx].scan(/[^:]+/).map {|lev| lev.scan(/[^|]+/) }
          file_under(lang,levels,idx)
        end
      end
      @the_idx
    end
    private
    # Walks down the tree one level at a time, creating nodes as needed, and
    # records idx on the node(s) of the final level. Descent continues only
    # through a level that names exactly one topic; the original raised on a
    # nil node in that situation, here the entry is reported and skipped.
    def file_under(lang,levels,idx)
      node=@the_idx[lang]
      levels.each_with_index do |alternatives,depth|
        break if depth >= MAX_LEVELS
        leaf=(depth == levels.length - 1)
        topics=alternatives.map {|alt| capital(alt.strip) }.reject {|alt| alt.empty? }
        topics.each do |topic|
          node[topic] ||= { md: [] }
          contents(lang,node[topic][:md],idx) if leaf
        end
        break if leaf
        unless topics.length == 1
          puts "ambiguous topic register (alternatives above final level) in: << #{idx[:filename]} >>"
          break
        end
        node=node[topics.first]
      end
    end
  end
  # Renders the topic tree, either as html files (html_print) or to the
  # screen (screen_print, screen_print_unsorted).
  #
  # Each of those three methods defines its rendering helpers with nested
  # +def+ and returns self, so the helpers of the most recently selected mode
  # are the ones in effect, e.g. Output_index.new(opt,idx).html_print.html_songsheet
  class Output_index
    require_relative 'i18n'                                # i18n.rb
    # keys of a document (md) entry, as opposed to topic names
    MD_FIELDS=[:title,:author,:filename,:file,:rough_idx,:page].freeze
    def initialize(opt,the_idx)
      @opt,@the_idx=opt,the_idx
      @env=SiSU_Env::Info_env.new
      @rc=SiSU_Env::Get_init.instance.sisu_yaml.rc
      @alph=%W[9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z]
      @letter=@letter_start=@alph.shift                    # '9' sorts before every letter
      @letters_pending=@alph.dup
      @vz=SiSU_Env::Get_init.instance.skin
    end
    # Opens one output file per language, plus a maintenance copy in the
    # working directory when the command contains -M.
    # NOTE: the maintenance path does not vary with language.
    def html_file_open
      @the_idx.keys.each do |lng|
        @output ||={}
        @output[lng] ||={}
        if @env.output_dir_structure.by_language_code?
          harvest_pth="#{@env.path.webserv}/#{@opt.base_stub}/#{lng}/manifest"
          file="#{harvest_pth}/topics.html"
        else
          harvest_pth="#{@env.path.webserv}/#{@opt.base_stub}/manifest"
          file="#{harvest_pth}/topics.#{lng}.html"
        end
        FileUtils.mkdir_p(harvest_pth)
        puts "file://#{file}"
        @output[lng][:html]=File.new(file,'w')
        if @opt.cmd.inspect =~/-M/
          @output[lng][:html_mnt]=File.new("#{@env.path.pwd}/topics.html",'w')
        end
      end
    end
    def html_file_close
      @the_idx.keys.each do |lng|
        @output[lng][:html].close
        @output[lng][:html_mnt].close if maintenance_file?(lng)
      end
    end
    # Selects html output; returns self with the html helpers defined.
    def html_print
      # Writes the complete topic index for every language.
      def html_songsheet
        html_file_open
        html_head
        html_alph
        html_body
        html_tail
        html_file_close
      end
      # Returns the html page head for language lng; type 'maintenance'
      # selects the stylesheet path used by the working-directory copy.
      def html_head_adjust(lng,type='')
        maintenance=(type =~/maintenance/)
        css_path,authors,home='','',''
        if @env.output_dir_structure.by_language_code?
          css_path=maintenance ? 'harvest.css' : '../../_sisu/css/harvest.css'
          authors='authors.html'
          home='../../index.html'                          # NOTE(review): reconstructed, same depth as css_path
        elsif @env.output_dir_structure.by_filetype? \
        or @env.output_dir_structure.by_filename?
          css_path=maintenance ? 'harvest.css' : '../_sisu/css/harvest.css'
          authors="authors.#{lng}.html"
          home='../index.html'                             # NOTE(review): reconstructed, same depth as css_path
        end
        ln=SiSU_i18n::Languages.new.language.list
        harvest_languages=''
        @the_idx.keys.each do |l_code|
          file=if @env.output_dir_structure.by_language_code?
            "../../#{l_code}/manifest/topics.html"
          else
            "./topics.#{l_code}.html"
          end
          l=ln[l_code][:t]
          harvest_languages += %{<a href="#{file}">#{l}</a>&nbsp;&nbsp; }
        end
        # NOTE(review): sv is presumably a hash with :project/:version/:date_stamp - confirm
        sv=SiSU_Env::Info_version.instance.get_version
<<WOK
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<title>SiSU Metadata Harvest - Topics</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="#{sv[:project]} #{sv[:version]} of #{sv[:date_stamp]}" />
<link rel="stylesheet" href="#{css_path}" type="text/css" />
</head>
<body lang="en" xml:lang="en">
<a name="top" id="top"></a>
<h1>SiSU Metadata Harvest - Topics</h1>
<p>[<a href="#{home}">&nbsp;HOME&nbsp;</a>] also see <a href="#{authors}">SiSU Metadata Harvest - Authors</a></p>
<p>#{@env.widget_static.search_form}</p>
<hr />
<p>#{harvest_languages}</p>
<hr />
WOK
      end
      def html_head
        @the_idx.keys.each do |lng|
          do_html_maintenance(lng,html_head_adjust(lng,'maintenance'))
          do_html(lng,html_head_adjust(lng))
        end
      end
      # Writes the A-Z jump bar; targets are the letter anchors emitted by
      # do_string_name.
      def html_alph
        a=['<p>']
        @alph.each do |x|
          a << ((x =~/[0-9]/) \
          ? ''
          : %{<a href="##{x}">#{x}</a>,&nbsp;})
        end
        a << "</p>\n"
        a=a.join
        @the_idx.keys.each do |lng|
          do_html_maintenance(lng,a)
          do_html(lng,a)
        end
      end
      def html_tail
        a =<<WOK
<hr />
<a name="bottom" id="bottom"></a>
<a name="end" id="end"></a>
#{@vz.credits_sisu}
</body>
</html>
WOK
        @the_idx.keys.each do |lng|
          do_html_maintenance(lng,a)
          do_html(lng,a)
        end
      end
      def do_html(lng,html)
        @output[lng][:html] << html
      end
      # Writes to the maintenance copy, if one was opened for lng.
      def do_html_maintenance(lng,html)
        @output[lng][:html_mnt] << html if maintenance_file?(lng)
      end
      # Topic heading below the top level: written to both outputs.
      def do_string(lng,attrib,string)
        html=%{<p class="#{attrib}">#{string}</p>}
        do_html(lng,html)
        do_html_maintenance(lng,html)
      end
      def do_string_default(lng,attrib,string)
        html=%{<p class="#{attrib}">#{string}</p>}
        do_html(lng,html)
      end
      def do_string_maintenance(lng,attrib,string)
        html=%{<p class="#{attrib}">#{string}</p>}
        do_html_maintenance(lng,html)
      end
      # Top level topic heading. First emits a heading for every letter of
      # the alphabet not yet reached, up to the topic's initial.
      def do_string_name(lng,attrib,string)
        f=string[/\S/]                                     # nil for a blank topic: no letter advance
        while f && @letter < f && !@letters_pending.empty?
          @letter=@letters_pending.shift
          heading=%{\n<p class="letter"><a name="#{@letter}" id="#{@letter}">#{@letter}</a></p>}
          do_html_maintenance(lng,heading)
          do_html(lng,heading)
        end
        name=string.strip.gsub(/\s+/,'_')
        html=%{<p class="#{attrib}"><a name="#{name}" id="#{name}">#{string}</a></p>}
        do_html(lng,html)
        do_html_maintenance(lng,html)
      end
      def do_array(lng,lv,array)
        lv+=1
        array.each do |b|
          do_case(lng,lv,b)
        end
      end
      # Document line for the published index: title links to the manifest.
      def do_hash_md(lng,attrib,hash)
        if @env.output_dir_structure.by_language_code?
          manifest_at=hash[:file] + '.html'
        elsif @env.output_dir_structure.by_filetype?
          manifest_at=hash[:file] + '.' + lng + '.html'
        elsif @env.output_dir_structure.by_filename?
          manifest_at="../#{hash[:file]}/#{hash[:page]}"
        end
        html=%{<a href="#{manifest_at}">#{hash[:title]}</a> - #{hash[:author]}}
        do_string_default(lng,attrib,html)
      end
      # Document line for the maintenance copy: links to the source file.
      def do_hash_md_maintenance(lng,attrib,hash)
        if maintenance_file?(lng) #should not be run for presentation output
          html=%{[<a href="#{hash[:filename]}">src</a>]&nbsp;&nbsp;#{hash[:title]} - #{hash[:author]}}
          do_string_maintenance(lng,attrib,html)
        end
      end
      # Renders one tree node: its documents first, then sub-topics sorted
      # by name.
      def do_hash(lng,lv,hash)
        lv+=1
        key=[]
        hash.each_key do |m|
          if m == :md
            do_case(lng,lv,hash[m])
          elsif not MD_FIELDS.include?(m)
            key << m
          elsif m == :title
            do_hash_md(lng,'work',hash)
            do_hash_md_maintenance(lng,'work',hash)
          end
        end
        key.sort.each do |m|
          attrib="lev#{lv}"
          lv==0 ? do_string_name(lng,attrib,m) : do_string(lng,attrib,m)
          do_case(lng,lv,hash[m])
        end
      end
      def do_case(lng,lv,a)
        case a
        when String
          attrib="lev#{lv}"
          lv==0 ? do_string_name(lng,attrib,a) : do_string(lng,attrib,a)
        when Array
          do_array(lng,lv,a)
        when Hash
          do_hash(lng,lv,a)
        end
      end
      def html_body
        @the_idx.each_pair do |lng,topics|
          # restart the alphabet for each language; the original consumed
          # @alph once, so later languages lost their letter headings
          @letter,@letters_pending=@letter_start,@alph.dup
          topics.sort.each do |a|
            do_case(lng,-1,a)
          end
        end
      end
      self
    end
    # Selects indented plain-text output with sub-topics sorted by name;
    # returns self, call #cycle to print.
    def screen_print
      # the original took (lv,string) but every caller passes lng as well
      def do_string(lng,lv,string)
        puts((' '*4*lv) + string)
      end
      def do_array(lng,lv,array)
        lv+=1
        array.each do |b|
          do_case(lng,lv,b)
        end
      end
      def do_hash_md(lng,lv,hash)
        string=hash[:title] + ' - ' + hash[:author]
        do_string(lng,lv,string)
      end
      def do_hash(lng,lv,hash)
        lv+=1
        key=[]
        hash.each_key do |m|
          if m == :md
            do_case(lng,lv,hash[m])
          elsif not MD_FIELDS.include?(m)
            key << m
          elsif m == :title
            do_hash_md(lng,lv,hash)
          end
        end
        key.sort.each do |m|
          do_string(lng,lv,m)
          do_case(lng,lv,hash[m])
        end
      end
      def do_case(lng,lv,a)
        case a
        when String
          do_string(lng,lv,a)
        when Array
          do_array(lng,lv,a)
        when Hash
          do_hash(lng,lv,a)
        end
      end
      def cycle
        @the_idx.keys.each do |lng|
          @the_idx[lng].each do |a|
            do_case(lng,-1,a)
          end
        end
      end
      self
    end
    # As screen_print, but sub-topics are printed in insertion order.
    def screen_print_unsorted
      screen_print                                         # identical helpers, except do_hash below
      def do_hash(lng,lv,hash)
        lv+=1
        hash.each_key do |m|
          if m == :md
            do_case(lng,lv,hash[m])
          elsif not MD_FIELDS.include?(m)
            do_string(lng,lv,m)
            do_case(lng,lv,hash[m])
          elsif m == :title
            do_hash_md(lng,lv,hash)
          end
        end
      end
      self
    end
    private
    # True when a maintenance copy was opened for lng by html_file_open.
    def maintenance_file?(lng)
      @output[lng][:html_mnt].is_a?(File)
    end
  end
end
__END__
terms -|_ t{tl1}  -|_ {fa}[fa]{filenames and other details}
       |           |_ {tl2} -|_ {fa}[fa]{filenames and other details}
       |           |         |_{tl3} -|_ {fa}[fa]{filenames and other details}
       |           |         |        |_{tl4} - {fa}[fa]{filenames and other details}
       |           |         |        |
       |           |         |        |_{tl4a} - {fa}[fa]{filenames and other details}
       |           |         |        |
       |           |         |        |_{tl4b} - {fa}[fa]{filenames and other details}
       |           |         |        |
       |           |         |        |_ ...
       |           |         |
       |           |         |_{tl3a} - {fa}[fa]{filenames and other details}
       |           |
       |           |_{tl2a} - {fa}[fa]{filenames and other details}
       |
       |_ t{tl1a} -|_ {fa}[fa]{filenames and other details}
       |_ ...