diff options
Diffstat (limited to 'lib/sisu/v1/harvest_topics.rb')
-rw-r--r-- | lib/sisu/v1/harvest_topics.rb | 571 |
1 files changed, 571 insertions, 0 deletions
# coding: utf-8
=begin

 * Name: SiSU

 * Description: a framework for document structuring, publishing and search
   metadata harvest, extract topics and associated writings from document set
   (topics use topic_register header)

 * Author: Ralph Amissah

 * Copyright: (C) 1997 - 2009 Ralph Amissah All Rights Reserved.

 * License: GPL 3 or later:

   SiSU, a framework for document structuring, publishing and search

   Copyright (C) Ralph Amissah

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by the Free
   Software Foundation, either version 3 of the License, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
   more details.

   You should have received a copy of the GNU General Public License along with
   this program. If not, see <http://www.gnu.org/licenses/>.

   If you have Internet connection, the latest version of the GPL should be
   available at these locations:
   <http://www.fsf.org/licensing/licenses/gpl.html>
   <http://www.gnu.org/licenses/gpl.html>

   <http://www.jus.uio.no/sisu/gpl.fsf/toc.html>
   <http://www.jus.uio.no/sisu/gpl.fsf/doc.html>
   <http://www.jus.uio.no/sisu/gpl.fsf/plain.txt>

 * SiSU uses:
   * Standard SiSU markup syntax,
   * Standard SiSU meta-markup syntax, and the
   * Standard SiSU object citation numbering and system

 * Homepages:
   <http://www.jus.uio.no/sisu>
   <http://www.sisudoc.org>

 * Download:
   <http://www.jus.uio.no/sisu/SiSU/download.html>

 * Ralph Amissah
   <ralph@amissah.com>
   <ralph.amissah@gmail.com>

 ** Description: metadata harvest of topics (@topic_register: header),
    written out as an html topic index

=end
module HARVEST_topics
  require "#{SiSU_lib}/author_format"
  # Entry point: reads the headers of the given .sst/.ssm files, harvests
  # their topic registers and writes harvest_topics.html.
  class Songsheet
    # Topic tree shared by every Songsheet instance (accumulates across runs
    # within one process).  Initialised here because #songsheet reads it;
    # `||=` leaves any value that was already set elsewhere untouched.
    @@the_idx_topics ||= {}
    # opt:: options object; this class uses opt.files (list of file names)
    #       and opt.cmd (inspected for the -M maintenance flag).
    def initialize(opt)
      @opt=opt
      @file_list=opt.files
      @env=SiSU_Env::Info_env.new
    end
    # Harvests every .sst/.ssm file in the file list, builds the topic index
    # and writes it as html; prints the location(s) of the generated file.
    def songsheet
      files,idx_array=[],[]
      @file_list.each do |f|
        if f =~/.+?\.ss[tm]$/
          files << f[/(.+?\.ss[tm])$/,1]
        else
          puts "not .sst or .ssm ? << #{f} >>"
        end
      end
      files.each do |filename|
        file_array=[]
        File.open(filename,'r') do |file|
          # the file is read a paragraph at a time (blank-line separated);
          # reading stops at the first paragraph that is neither an
          # "@header: " paragraph, nor blank, nor a "% " comment
          file.each_line("\n\n") do |para|
            if para =~/^@\S+?: /
              file_array << para
            elsif para =~/^(?:\s*\n|%+ )/
              next
            else
              break
            end
          end
        end
        idx_array=HARVEST_topics::Harvest.new(file_array,filename,idx_array).extract_harvest
      end
      the_idx=HARVEST_topics::Index.new(idx_array,@@the_idx_topics).construct_book_topic_index
      #HARVEST_topics::Output_index.new(@opt,the_idx).screen_print.cycle
      HARVEST_topics::Output_index.new(@opt,the_idx).html_print.html_songsheet
      puts "file://#{@env.path.output_md_harvest}/harvest_topics.html"
      puts "file://#{@env.path.pwd}/harvest_topics.html" if @opt.cmd.inspect =~/-M/
    end
  end
  # Extracts title, subtitle, author and topic register from the header
  # paragraphs of one document.
  class Harvest
    # data::      array of header paragraphs (strings) of one document
    # filename::  name of the document the paragraphs were read from
    # idx_array:: array of entries harvested so far; new entries are appended
    def initialize(data,filename,idx_array)
      @data,@filename,@idx_array=data,filename,idx_array
    end
    # Appends one entry (a hash with the keys :filename, :file, :rough_idx,
    # :title, :author, :page) per ';'-separated topic of the document's
    # @topic_register: header to idx_array and returns idx_array.
    # Nothing is appended (and a notice is printed) when the title, the
    # author or the topic register is missing.
    def extract_harvest
      data,filename,idx_array=@data,@filename,@idx_array
      @idx_list,@title,@subtitle,@fulltitle,@author_format=nil,nil,nil,nil,nil
      rgx={
        :author   => /^@(?:author|creator):\s+(.+)/,
        :title    => /^@title:\s+(.+)/,
        :subtitle => /^@subtitle:\s+(.+)/,
        :idx      => /^@topic_register:\s+(.+)/
      }
      data.each do |para|
        @idx_list=rgx[:idx].match(para)[1] if para =~ rgx[:idx]
        @title=rgx[:title].match(para)[1] if para =~ rgx[:title]
        @subtitle=rgx[:subtitle].match(para)[1] if para =~ rgx[:subtitle]
        @author_format=rgx[:author].match(para)[1] if para =~ rgx[:author]
        # stop scanning once everything of interest has been found
        break if @title && @subtitle && @author_format && @idx_list
      end
      # a subtitle without a title must not raise here; the missing title is
      # reported by the check below
      @fulltitle=(@title && @subtitle) ? @title + ' - ' + @subtitle : @title
      if @title && @author_format && @idx_list
        creator=FORMAT::Author.new(@author_format.strip).author_details
        @authors,@authorship=creator[:authors],creator[:authorship]
        # file names may carry a language tag: name~en.sst
        lang_tag=/~([a-z]{2,3})\.ss[mt]$/.match(filename)
        if lang_tag
          lang='.' + lang_tag[1]
          file=filename.sub(/~[a-z]{2,3}\.ss[mt]$/,'')
        else
          lang=''
          file=filename.sub(/\.ss[mt]$/,'')
        end
        page="sisu_manifest#{lang}.html"
        # one entry per ';'-separated topic (a register without ';' yields
        # a single entry)
        @idx_list.scan(/[^;]+/).each do |topic|
          topic=topic.strip
          next if topic.empty?
          idx_array << { :filename => filename, :file => file, :rough_idx => topic, :title => @fulltitle, :author => creator, :page => page }
        end
      else
        p "missing author field: #@filename title: #@title; author: #@author_format; idx: #@idx_list"
      end
      idx_array.flatten!
      idx_array
    end
  end
  # Builds the nested topic tree from the harvested entries.
  #
  # The tree is a hash keyed by capitalised topic term; every node is a hash
  # holding the key :md (array of works filed at that node) plus one key per
  # sub-topic term.
  class Index
    # idx_array:: entries produced by Harvest#extract_harvest
    # the_idx::   topic tree (hash) to be filled in; it is modified in place
    def initialize(idx_array,the_idx)
      @idx_array,@the_idx=idx_array,the_idx
      @@the_idx_topics=@the_idx # retained from the original implementation
    end
    # Returns txt with its first character capitalised and the remainder
    # unchanged; an empty string is returned as is.
    def capital(txt)
      return txt if txt.empty?
      txt[0].chr.capitalize + txt[1,txt.length]
    end
    # Appends the details of the work described by idx to hash, which is the
    # :md array of a topic node.  The author names are turned into links to
    # harvest_authors.html.
    def contents(hash,idx)
      names=''
      idx[:author][:last_first_format_a].each do |n|
        s=n.sub(/(.+?)(?:,.+|$)/,'\1').gsub(/\s+/,'_')
        names += %{<a href="harvest_authors.html##{s}">#{n}</a>, }
      end
      hash << { :filename => idx[:filename], :file => idx[:file], :author => names, :title => idx[:title], :page => idx[:page] }
    end
    # Files every harvested entry under its topic and returns the topic tree.
    #
    # A topic is a ':'-separated path of terms.  The last level of the path
    # may list several alternatives separated by '|', in which case the work
    # is filed under each of them.  Alternatives at any other level cannot be
    # descended into: a notice is printed and the deeper levels of that
    # topic are skipped.
    def construct_book_topic_index
      @idx_array.each do |idx|
        unless idx[:rough_idx]
          puts "no topic register in: << #{idx[:filename]} >>"
          next
        end
        levels=idx[:rough_idx].scan(/[^:]+/).map do |lev|
          lev.scan(/[^|]+/).map {|term| term.strip }.reject {|term| term.empty? }
        end
        node=@the_idx
        levels.each_with_index do |terms,depth|
          last=(depth == levels.length - 1)
          terms.each do |term|
            term=capital(term)
            node[term] ||= {:md => []}
            contents(node[term][:md],idx) if last
          end
          break if last
          unless terms.length == 1
            puts "topic register: alternatives (|) are only followed at the last level, deeper levels skipped in: << #{idx[:filename]} >>"
            break
          end
          node=node[capital(terms.first)]
        end
      end
      @the_idx
    end
  end
  # Writes the topic tree as html (#html_print) or to the screen
  # (#screen_print, #screen_print_unsorted).
  #
  # NOTE: each of these three methods defines its helper methods on this
  # class at the moment it is called, and the helpers share names
  # (do_string, do_array, do_hash, do_hash_md, do_case, cycle); the helpers
  # in effect are those of whichever of the three was called most recently.
  class Output_index
    # keys of a work entry (as opposed to topic terms) within the tree
    MD_KEYS=[:title,:author,:filename,:file,:rough_idx,:page]
    # opt::     options object; opt.cmd is inspected for the -M flag
    # the_idx:: topic tree as returned by Index#construct_book_topic_index
    def initialize(opt,the_idx)
      @opt,@the_idx=opt,the_idx
      @env=SiSU_Env::Info_env.new
      @rc=SiSU_Env::Get_init.instance.yamlrc
      @alph=%W[9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z]
      @letter=@alph.shift
      @vz=SiSU_Env::Get_init.instance.skin
    end
    # Opens the html output file and, when -M was given, a second
    # ("maintenance") copy in the present working directory.
    def html_file_open
      @output={}
      @output[:html]=File.new("#{@env.path.output_md_harvest}/harvest_topics.html",'w')
      if @opt.cmd.inspect =~/-M/
        @output[:html_mnt]=File.new("#{@env.path.pwd}/harvest_topics.html",'w')
      end
    end
    # Closes whatever #html_file_open opened.
    def html_file_close
      @output[:html].close
      @output[:html_mnt].close if @output[:html_mnt].class == File
    end
    # Defines the html output helpers and returns self, so that the output
    # is produced with: Output_index.new(opt,idx).html_print.html_songsheet
    def html_print
      # Writes the complete html page; the output files are closed even if
      # writing fails part way through.
      def html_songsheet
        html_file_open
        begin
          html_head
          html_alph
          html_body
          html_tail
        ensure
          html_file_close
        end
      end
      # Returns the html page head; type 'maintenance' selects the css path
      # used for the copy written to the present working directory.
      def html_head_adjust(type='')
        css_path=if type !~/maintenance/
          '../_sisu/css/harvest.css'
        else 'harvest.css'
        end
        sv=SiSU_Env::Info_version.instance.get_version
        <<WOK
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>SiSU Metadata Harvest - Topics</title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name="dc.title" content= "SiSU metadata harvest, Topics - SiSU information Structuring Universe, Structured information Serialised Units" />
<meta name="dc.subject" content= "document structuring, ebook, publishing, PDF, LaTeX, XML, ODF, SQL, postgresql, sqlite, electronic book, electronic publishing, electronic document, electronic citation, data structure, citation systems, granular search, digital library" />
<meta name="generator" content="#{sv[:project]} #{sv[:version]} of #{sv[:date_stamp]} (n*x and Ruby!)" />
<link rel="generator" href="http://www.jus.uio.no/sisu/SiSU" />
<link rel="stylesheet" href="#{css_path}" type="text/css" />
<link rel="shortcut icon" href="../_sisu/image/rb7.ico" />
</head>
<body bgcolor="#ffffff" text="#000000" link="#003090" lang="en" xml:lang="en">
<a name="top" id="top"></a>
<a name="up" id="up"></a>
<a name="start" id="start"></a>
<h1>SiSU Metadata Harvest - Topics</h1>
<p>[<a href="../index.html"> HOME </a>] also see <a href="harvest_authors.html">SiSU Metadata Harvest - Authors</a></p>
<hr />
WOK
      end
      # Writes the page head to the output file(s).
      def html_head
        @output[:html_mnt] << html_head_adjust('maintenance') if @opt.cmd.inspect =~/-M/
        @output[:html] << html_head_adjust
      end
      # Writes the row of alphabet links (A .. Z) to the output file(s).
      def html_alph
        links=@alph.map do |x|
          (x =~/[0-9]/) ? '' : %{<a href="##{x}">#{x}</a>, }
        end
        # a String (not the Array) is written, so that the same html reaches
        # both files
        html='<p>' + links.join + '</p>'
        @output[:html_mnt] << html if @opt.cmd.inspect =~/-M/
        @output[:html] << html
      end
      # Writes the page tail to the output file(s).
      def html_tail
        html=<<WOK
<hr />
<a name="bottom" id="bottom"></a>
<a name="down" id="down"></a>
<a name="end" id="end"></a>
<a name="finish" id="finish"></a>
<a name="stop" id="stop"></a>
<a name="credits"></a>
#{@vz.credits_sisu}
</body>
</html>
WOK
        @output[:html_mnt] << html if @output[:html_mnt].class == File
        @output[:html] << html
      end
      # Writes html to the main output file.
      def do_html(html)
        @output[:html] << html
      end
      # Writes html to the maintenance output file, if there is one.
      def do_html_maintenance(html)
        @output[:html_mnt] << html if @output[:html_mnt].class == File
      end
      # Writes string as a paragraph of css class attrib to both files.
      def do_string(attrib,string)
        html=%{<p class="#{attrib}">#{string}</p>}
        do_html(html)
        do_html_maintenance(html)
      end
      # As do_string, main output file only.
      def do_string_default(attrib,string)
        html=%{<p class="#{attrib}">#{string}</p>}
        do_html(html)
      end
      # As do_string, maintenance output file only.
      def do_string_maintenance(attrib,string)
        html=%{<p class="#{attrib}">#{string}</p>}
        do_html_maintenance(html)
      end
      # Writes a top level topic as a named anchor, preceded by a heading for
      # every letter of the alphabet not yet written, up to and including the
      # topic's initial.
      def do_string_name(attrib,string)
        f=string.strip[0,1]
        while @letter < f && @alph.length > 0
          @letter=@alph.shift
          letter_html=%{\n<p class="letter"><a name="#{@letter}">#{@letter}</a></p><p class="book_index_lev1"><a name="#{@letter.downcase}"></a></p>}
          do_html_maintenance(letter_html)
          do_html(letter_html)
        end
        name=string.strip.gsub(/\s+/,'_')
        html=%{<p class="#{attrib}"><a name="#{name}">#{string}</a></p>}
        do_html(html)
        do_html_maintenance(html)
      end
      # Outputs every element of array one level down.
      def do_array(lv,array)
        lv+=1
        array.each do |b|
          do_case(lv,b)
        end
      end
      # Writes the link to a work (main output file).
      def do_hash_md(attrib,hash)
        html=%{<a href="../#{hash[:file]}/#{hash[:page]}">#{hash[:title]}</a> - #{hash[:author]}}
        do_string_default(attrib,html)
      end
      # Writes the link to a work and to its source (maintenance file only).
      def do_hash_md_maintenance(attrib,hash)
        if @output[:html_mnt].class == File #should not be run for presentation output
          # the source link uses the recorded file name, so that .ssm and
          # language tagged (name~en.sst) sources are linked correctly
          html=%{[<a href="#{hash[:filename]}">src</a>] <a href="file://#{@env.path.output}/#{hash[:file]}/#{hash[:page]}">#{hash[:title]}</a> - #{hash[:author]}}
          do_string_maintenance(attrib,html)
        end
      end
      # Outputs a hash: a work entry is written as a link, a topic node has
      # its works (:md) written, followed by its sub-topics in sorted order.
      def do_hash(lv,hash)
        lv+=1
        key=[]
        hash.each_key do |m|
          if m == :md
            do_case(lv,hash[m])
          elsif m == :title
            do_hash_md('work',hash)
            do_hash_md_maintenance('work',hash)
          elsif !MD_KEYS.include?(m)
            key << m
          end
        end
        key.sort.each do |m|
          attrib="lev#{lv}"
          if lv == 0
            do_string_name(attrib,m)
          else do_string(attrib,m)
          end
          do_case(lv,hash[m])
        end
      end
      # Dispatches on the type of a (String, Array or Hash).
      def do_case(lv,a)
        case a
        when String
          attrib="lev#{lv}"
          if lv == 0
            do_string_name(attrib,a)
          else do_string(attrib,a)
          end
        when Array
          do_array(lv,a)
        when Hash
          do_hash(lv,a)
        end
      end
      # Writes the topic tree, top level topics in sorted order.
      def html_body
        @the_idx.sort.each do |a|
          do_case(-1,a)
        end
      end
      self
    end
    # Defines helpers that print the topic tree to the screen (sub-topics
    # sorted) and returns self; usage: ....screen_print.cycle
    def screen_print
      # Prints string indented four spaces per level.
      def do_string(lv,string)
        s=' '*4
        puts s*lv + string
      end
      def do_array(lv,array)
        lv+=1
        array.each do |b|
          do_case(lv,b)
        end
      end
      # Prints title and author of a work.
      def do_hash_md(lv,hash)
        string=hash[:title] + ' - ' + hash[:author]
        do_string(lv,string)
      end
      def do_hash(lv,hash)
        lv+=1
        key=[]
        hash.each_key do |m|
          if m == :md
            do_case(lv,hash[m])
          elsif m == :title
            do_hash_md(lv,hash)
          elsif !MD_KEYS.include?(m)
            key << m
          end
        end
        key.sort.each do |m|
          do_string(lv,m)
          do_case(lv,hash[m])
        end
      end
      def do_case(lv,a)
        case a
        when String
          do_string(lv,a)
        when Array
          do_array(lv,a)
        when Hash
          do_hash(lv,a)
        end
      end
      # Prints the whole topic tree.
      def cycle
        @the_idx.each do |a|
          do_case(-1,a)
        end
      end
      self
    end
    # As #screen_print, but sub-topics are printed in the order in which the
    # hash yields them rather than sorted.
    def screen_print_unsorted
      # Prints string indented four spaces per level.
      def do_string(lv,string)
        s=' '*4
        puts s*lv + string
      end
      def do_array(lv,array)
        lv+=1
        array.each do |b|
          do_case(lv,b)
        end
      end
      # Prints title and author of a work.
      def do_hash_md(lv,hash)
        string=hash[:title] + ' - ' + hash[:author]
        do_string(lv,string)
      end
      def do_hash(lv,hash)
        lv+=1
        hash.each_key do |m|
          if m == :md
            do_case(lv,hash[m])
          elsif m == :title
            do_hash_md(lv,hash)
          elsif !MD_KEYS.include?(m)
            do_string(lv,m)
            do_case(lv,hash[m])
          end
        end
      end
      def do_case(lv,a)
        case a
        when String
          do_string(lv,a)
        when Array
          do_array(lv,a)
        when Hash
          do_hash(lv,a)
        end
      end
      # Prints the whole topic tree.
      def cycle
        @the_idx.each do |a|
          do_case(-1,a)
        end
      end
      self
    end
  end
end
__END__
terms -|_ t{tl1} -|_ {fa}[fa]{filenames and other details}
       |          |_ {tl2} -|_ {fa}[fa]{filenames and other details}
       |          |         |_{tl3} -|_ {fa}[fa]{filenames and other details}
       |          |         |        |_{tl4} - {fa}[fa]{filenames and other details}
       |          |         |        |
       |          |         |        |_{tl4a} - {fa}[fa]{filenames and other details}
       |          |         |        |
       |          |         |        |_{tl4b} - {fa}[fa]{filenames and other details}
       |          |         |        |
       |          |         |        |_ ...
       |          |         |
       |          |         |_{tl3a} - {fa}[fa]{filenames and other details}
       |          |
       |          |_{tl2a} - {fa}[fa]{filenames and other details}
       |
       |_ t{tl1a} -|_ {fa}[fa]{filenames and other details}
                   |_ ...