aboutsummaryrefslogtreecommitdiffhomepage
path: root/lib/sisu/v2/harvest_topics.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/sisu/v2/harvest_topics.rb')
-rw-r--r--lib/sisu/v2/harvest_topics.rb559
1 files changed, 559 insertions, 0 deletions
diff --git a/lib/sisu/v2/harvest_topics.rb b/lib/sisu/v2/harvest_topics.rb
new file mode 100644
index 00000000..cf913b80
--- /dev/null
+++ b/lib/sisu/v2/harvest_topics.rb
@@ -0,0 +1,559 @@
+# coding: utf-8
+=begin
+
+ * Name: SiSU
+
+ * Description: a framework for document structuring, publishing and search
+ metadata harvest, extract topics and associated writings from document set
+ (topics use topic_register header)
+
+ * Author: Ralph Amissah
+
+ * Copyright: (C) 1997 - 2010, Ralph Amissah, All Rights Reserved.
+
+ * License: GPL 3 or later:
+
+ SiSU, a framework for document structuring, publishing and search
+
+ Copyright (C) Ralph Amissah
+
+ This program is free software: you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the Free
+ Software Foundation, either version 3 of the License, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program. If not, see <http://www.gnu.org/licenses/>.
+
+ If you have Internet connection, the latest version of the GPL should be
+ available at these locations:
+ <http://www.fsf.org/licensing/licenses/gpl.html>
+ <http://www.gnu.org/licenses/gpl.html>
+
+ <http://www.jus.uio.no/sisu/gpl.fsf/toc.html>
+ <http://www.jus.uio.no/sisu/gpl.fsf/doc.html>
+ <http://www.jus.uio.no/sisu/gpl.fsf/plain.txt>
+
+ * SiSU uses:
+ * Standard SiSU markup syntax,
+ * Standard SiSU meta-markup syntax, and the
+ * Standard SiSU object citation numbering and system
+
+ * Homepages:
+ <http://www.jus.uio.no/sisu>
+ <http://www.sisudoc.org>
+
+ * Download:
+ <http://www.jus.uio.no/sisu/SiSU/download.html>
+
+ * Ralph Amissah
+ <ralph@amissah.com>
+ <ralph.amissah@gmail.com>
+
+ ** Description: harvest of topic_register metadata from a document set, written out as an html topic index
+
+=end
+module HARVEST_topics
+ require "#{SiSU_lib}/author_format" # author_format.rb
+ class Songsheet
+ def initialize(opt)
+ @opt=opt
+ @file_list=opt.files
+ @env=SiSU_Env::Info_env.new
+ end
+ def songsheet
+ files,idx_array=[],[]
+ @file_list.each do |f|
+ if f =~/.+?\.ss[tm]$/
+ files << f[/(.+?\.ss[tm])$/,1]
+ else
+ print "not .sst or .ssm ? << #{f} >> "
+ end
+ end
+ files.each do |filename|
+ file_array=[]
+ File.open(filename,'r') do |file|
+ file.each_line("\n\n") do |line|
+ if line =~/^@(?:title|creator|classify):(?:\s|$)/m
+ file_array << line
+ elsif line =~/^@\S+?:(?:\s|$)/m \
+ or line =~/^(?:\s*\n|%+ )/
+ else break
+ end
+ end
+ end
+ idx_array=HARVEST_topics::Harvest.new(file_array,filename,idx_array).extract_harvest
+ end
+ the_idx=HARVEST_topics::Index.new(idx_array,@@the_idx_topics).construct_book_topic_index
+ HARVEST_topics::Output_index.new(@opt,the_idx).html_print.html_songsheet
+ puts "file://#{@env.path.output_md_harvest}/harvest_topics.html"
+ puts "file://#{@env.path.pwd}/harvest_topics.html" if @opt.cmd.inspect =~/-M/
+ end
+ end
+ class Harvest
+ def initialize(data,filename,idx_array)
+ @data,@filename,@idx_array=data,filename,idx_array
+ end
+ def extract_harvest
+ data,filename,idx_array=@data,@filename,@idx_array
+ @idx_lst,@title,@subtitle,@fulltitle,@author,@author_format=nil,nil,nil,nil,nil,nil
+ rgx={}
+ rgx[:author]=/^@creator:(?:[ ]+|.+?:author:[ ]+)(.+?)(?:\||\n)/m
+ rgx[:title]=/^@title:[ ]+(.+)/
+ rgx[:subtitle]=/^@title:.+?:subtitle:[ ]+(.+?)\n/m
+ rgx[:idx]=/^@classify:.+?:topic_register:[ ]+(.+?)\n/m
+ data.each do |para|
+ if para=~ rgx[:idx]
+ @idx_list=rgx[:idx].match(para)[1]
+ end
+ if para=~ rgx[:title]
+ @title=rgx[:title].match(para)[1]
+ end
+ if para=~ rgx[:subtitle]
+ @subtitle=rgx[:subtitle].match(para)[1]
+ end
+ if para=~ rgx[:author]
+ @author_format=rgx[:author].match(para)[1]
+ end
+ break if @title and @subtitle and @author and @idx_lst #and @date
+ end
+ @fulltitle=(@subtitle ? (@title + ' - ' + @subtitle) : @title)
+ if @title and @author_format and @idx_list
+ creator=FORMAT::Author.new(@author_format.strip).author_details
+ @authors,@authorship=creator[:authors],creator[:authorship]
+ file=if filename=~/~[a-z]{2,3}\.ss[mt]$/
+ lang='.' + /~([a-z]{2,3})\.ss[mt]$/.match(filename)[1]
+ filename.sub(/~[a-z]{2,3}\.ss[mt]$/,'')
+ else
+ lang=''
+ filename.sub(/\.ss[mt]$/,'')
+ end
+ page="sisu_manifest#{lang}.html"
+ idx_array <<=if @idx_list =~/;/
+ g=@idx_list.scan(/[^;]+/)
+ idxl=[]
+ g.each do |i|
+ i.strip!
+ idxl << { :filename =>filename,:file =>file,:rough_idx =>i,:title =>@fulltitle,:author =>creator,:page =>page}
+ end
+ idxl
+ else { :filename =>filename,:file =>file,:rough_idx =>@idx_list,:title =>@fulltitle,:author =>creator,:page =>page }
+ end
+ else
+ p "missing required field in #{@filename} - [title]: <<#{@title}>>; [author]: <<#{@author_format}>>; [idx]: <<#{@idx_list}>>"
+ end
+ idx_array.flatten!
+ idx_array
+ end
+ end
+ class Index
+ def initialize(idx_array,the_idx)
+ @idx_array,@the_idx=idx_array,the_idx
+ @@the_idx_topics=@the_idx
+ end
+ def capital(txt)
+ txt[0].chr.capitalize + txt[1,txt.length]
+ end
+ def contents(hash,idx)
+ names=''
+ idx[:author][:last_first_format_a].each do |n|
+ s=n.sub(/(.+?)(?:,.+|$)/,'\1').gsub(/\s+/,'_')
+ names += %{<a href="harvest_authors.html##{s}">#{n}</a>, }
+ end
+ hash << { :filename =>idx[:filename],:file =>idx[:file],:author => names,:title =>idx[:title],:page =>idx[:page] }
+ end
+ def construct_book_topic_index
+ idx_array=@idx_array
+ idx_array.each do |idx|
+ @lv0,@lv1,@lv2,@lv3,@lv4={},{},{},{},{}
+ if idx[:rough_idx]
+ idx_lst=idx[:rough_idx].scan(/[^:]+/)
+ else
+ puts "no topic register in: << #{idx[:filename]} >>"
+ next
+ end
+ idx_lst_alt=[]
+ idx_lst.each {|lev| idx_lst_alt << lev.scan(/[^|]+/)}
+ depth = idx_lst_alt.length - 1
+ range = 0..depth
+ range.each do |t|
+ if idx_lst_alt[t]
+ case t
+ when 0
+ lev0=idx_lst_alt[t]
+ lev0.each do |lv0|
+ lv0=capital(lv0)
+ if @@the_idx_topics[lv0].class==NilClass
+ @@the_idx_topics[lv0]={:md => []}
+ end
+ @lv0=lv0 if lev0.length==1
+ j=@@the_idx_topics[lv0][:md]
+ contents(j,idx) if idx_lst_alt.length - 1 == t
+ end
+ when 1
+ lev1=idx_lst_alt[t]
+ lev1.each do |lv1|
+ lv1=capital(lv1)
+ if @@the_idx_topics[@lv0][lv1].class==NilClass
+ @@the_idx_topics[@lv0][lv1]={:md => []}
+ end
+ @lv1=lv1 if lev1.length==1
+ j=@@the_idx_topics[@lv0][lv1][:md]
+ contents(j,idx) if idx_lst_alt.length - 1 == t
+ end
+ when 2
+ lev2=idx_lst_alt[t]
+ lev2.each do |lv2|
+ lv2=capital(lv2)
+ if @@the_idx_topics[@lv0][@lv1][lv2].class==NilClass
+ @@the_idx_topics[@lv0][@lv1][lv2]={:md => []}
+ end
+ @lv2=lv2 if lev2.length==1
+ j=@@the_idx_topics[@lv0][@lv1][lv2][:md]
+ contents(j,idx) if idx_lst_alt.length - 1 == t
+ end
+ when 3
+ lev3=idx_lst_alt[t]
+ lev3.each do |lv3|
+ lv3=capital(lv3)
+ if @@the_idx_topics[@lv0][@lv1][@lv2][lv3].class==NilClass
+ @@the_idx_topics[@lv0][@lv1][@lv2][lv3]={:md => []}
+ end
+ @lv3=lv3 if lev3.length==1
+ j=@@the_idx_topics[@lv0][@lv1][@lv2][lv3][:md]
+ contents(j,idx) if idx_lst_alt.length - 1 == t
+ end
+ when 4
+ lev4=idx_lst_alt[t]
+ lev4.each do |lv4|
+ lv4=capital(lv4)
+ if @@the_idx_topics[@lv0][@lv1][@lv2][@lv3][lv4].class==NilClass
+ @@the_idx_topics[@lv0][@lv1][@lv2][@lv3][lv4]={:md => []}
+ end
+ @lv4=lv4 if lev4.length==1
+ j=@@the_idx_topics[@lv0][@lv1][@lv2][@lv3][lv4][:md]
+ contents(j,idx) if idx_lst_alt.length - 1 == t
+ end
+ end
+ end
+ end
+ end
+ @the_idx
+ end
+ end
+ class Output_index
+ def initialize(opt,the_idx)
+ @opt,@the_idx=opt,the_idx
+ @env=SiSU_Env::Info_env.new
+ @rc=Get_init.instance.yamlrc
+ @alph=%W[9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z]
+ @letter=@alph.shift
+ @vz=SiSU_Env::Get_init.instance.skin
+ end
+ def html_file_open
+ @output={}
+ @output[:html]=File.new("#{@env.path.output_md_harvest}/harvest_topics.html",'w')
+ if @opt.cmd.inspect =~/-M/
+ @output[:html_mnt]=File.new("#{@env.path.pwd}/harvest_topics.html",'w')
+ end
+ end
+ def html_file_close
+ @output[:html].close
+ @output[:html_mnt].close if @output[:html_mnt].class==File
+ end
+ def html_print
+ def html_songsheet
+ html_file_open
+ html_head
+ html_alph
+ html_body
+ html_tail
+ html_file_close
+ end
+ def html_head_adjust(type='')
+ css_path=if type !~/maintenance/
+ '../_sisu/css/harvest.css'
+ else 'harvest.css'
+ end
+ sv=SiSU_Env::Info_version.instance.get_version
+ <<WOK
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<title>SiSU Metadata Harvest - Topics</title>
+<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
+<meta name="dc.title" content= "SiSU metadata harvest, Topics - SiSU information Structuring Universe, Structured information Serialised Units" />
+<meta name="dc.subject" content= "document structuring, ebook, publishing, PDF, LaTeX, XML, ODF, SQL, postgresql, sqlite, electronic book, electronic publishing, electronic document, electronic citation, data structure, citation systems, granular search, digital library" />
+<meta name="generator" content="#{sv[:project]} #{sv[:version]} of #{sv[:date_stamp]} (n*x and Ruby!)" />
+<link rel="generator" href="http://www.jus.uio.no/sisu/SiSU" />
+<link rel="stylesheet" href="#{css_path}" type="text/css" />
+<link rel="shortcut icon" href="../_sisu/image/rb7.ico" />
+</head>
+<body bgcolor="#ffffff" text="#000000" link="#003090" lang="en" xml:lang="en">
+<a name="top" id="top"></a>
+<a name="up" id="up"></a>
+<a name="start" id="start"></a>
+<h1>SiSU Metadata Harvest - Topics</h1>
+<p>[<a href="../index.html">&nbsp;HOME&nbsp;</a>] also see <a href="harvest_authors.html">SiSU Metadata Harvest - Authors</a></p>
+<hr />
+WOK
+ end
+ def html_head
+ @output[:html_mnt] << html_head_adjust('maintenance') if @opt.cmd.inspect =~/-M/
+ @output[:html] << html_head_adjust
+ end
+ def html_alph
+ a=[]
+ a << '<p>'
+ @alph.each do |x|
+ a << if x =~/[0-9]/; ''
+ else
+ %{<a href="##{x}">#{x}</a>,&nbsp;}
+ end
+ end
+ @output[:html_mnt] << a if @opt.cmd.inspect =~/-M/
+ @output[:html] << a.join
+ end
+ def html_tail
+ a=[]
+ a <<<<WOK
+<hr />
+<a name="bottom" id="bottom"></a>
+<a name="down" id="down"></a>
+<a name="end" id="end"></a>
+<a name="finish" id="finish"></a>
+<a name="stop" id="stop"></a>
+<a name="credits"></a>
+#{@vz.credits_sisu}
+</body>
+</html>
+WOK
+ @output[:html_mnt] << a if @output[:html_mnt].class==File
+ @output[:html] << a
+ end
+ def do_html(html)
+ @output[:html] << html
+ end
+ def do_html_maintenance(html)
+ @output[:html_mnt] << html if @output[:html_mnt].class==File
+ end
+ def do_string(attrib,string)
+ html=%{<p class="#{attrib}">#{string}</p>}
+ do_html(html)
+ do_html_maintenance(html) if @output[:html_mnt].class==File
+ end
+ def do_string_default(attrib,string)
+ html=%{<p class="#{attrib}">#{string}</p>}
+ do_html(html)
+ end
+ def do_string_maintenance(attrib,string)
+ html=%{<p class="#{attrib}">#{string}</p>}
+ do_html_maintenance(html) if @output[:html_mnt].class==File
+ end
+ def do_string_name(attrib,string)
+ f=/^(\S)/.match(string)[1]
+ if @letter < f
+ while @letter < f
+ if @alph.length > 0
+ @letter=@alph.shift
+ if @output[:html_mnt].class==File
+ @output[:html_mnt] << %{\n<p class="letter"><a name="#{@letter}">#{@letter}</a></p><p class="book_index_lev1"><a name="#{@letter.downcase}"></a></p>}
+ end
+ @output[:html] << %{\n<p class="letter"><a name="#{@letter}">#{@letter}</a></p><p class="book_index_lev1"><a name="#{@letter.downcase}"></a></p>}
+ else break
+ end
+ end
+ end
+ name=string.strip.gsub(/\s+/,'_')
+ html=%{<p class="#{attrib}"><a name="#{name}">#{string}</a></p>}
+ do_html(html)
+ do_html_maintenance(html) if @output[:html_mnt].class==File
+ end
+ def do_array(lv,array)
+ lv+=1
+ array.each do |b|
+ do_case(lv,b)
+ end
+ end
+ def do_hash_md(attrib,hash)
+ html=%{<a href="../#{hash[:file]}/#{hash[:page]}">#{hash[:title]}</a> - #{hash[:author]}}
+ do_string_default(attrib,html)
+ end
+ def do_hash_md_maintenance(attrib,hash)
+ if @output[:html_mnt].class==File #should not be run for presentation output
+ html=%{[<a href="#{hash[:file]}.sst">src</a>]&nbsp;&nbsp;<a href="file://#{@env.path.output}/#{hash[:file]}/#{hash[:page]}">#{hash[:title]}</a> - #{hash[:author]}}
+ do_string_maintenance(attrib,html)
+ end
+ end
+ def do_hash(lv,hash)
+ lv+=1
+ key=[]
+ hash.each_key do |m|
+ if m == :md
+ do_case(lv,hash[m])
+ elsif m != :title and m != :author and m != :filename and m != :file and m != :rough_idx and m != :page
+ key << m
+ elsif m == :title
+ do_hash_md('work',hash)
+ do_hash_md_maintenance('work',hash)
+ end
+ end
+ if key.length > 0
+ key.sort.each do |m|
+ attrib="lev#{lv}"
+ lv==(0 ? do_string_name(attrib,m) : do_string(attrib,m))
+ do_case(lv,hash[m])
+ end
+ end
+ end
+ def do_case(lv,a)
+ y = a.class
+ case
+ when y==String
+ attrib="lev#{lv}"
+ lv==(0 ? do_string_name(attrib,a) : do_string(attrib,a))
+ when y==Array
+ do_array(lv,a)
+ when y==Hash
+ do_hash(lv,a)
+ end
+ end
+ def html_body
+ the_idx=@the_idx
+ the_idx.sort.each do |a|
+ do_case(-1,a)
+ end
+ end
+ self
+ end
+ def screen_print
+ def do_string(lv,string)
+ s=' '*4
+ puts s*lv + string
+ end
+ def do_array(lv,array)
+ lv+=1
+ array.each do |b|
+ do_case(lv,b)
+ end
+ end
+ def do_hash_md(lv,hash)
+ string=hash[:title] + ' - ' + hash[:author]
+ do_string(lv,string)
+ end
+ def do_hash(lv,hash)
+ lv+=1
+ key=[]
+ hash.each_key do |m|
+ if m == :md
+ do_case(lv,hash[m])
+ elsif m != :title and m != :author and m != :filename and m != :file and m != :rough_idx and m != :page
+ key << m
+ elsif m == :title
+ do_hash_md(lv,hash)
+ end
+ end
+ if key.length > 0
+ key.sort.each do |m|
+ do_string(lv,m)
+ do_case(lv,hash[m])
+ end
+ end
+ end
+ def do_case(lv,a)
+ s=' '*4
+ y = a.class
+ case
+ when y==String
+ do_string(lv,a)
+ when y==Array
+ do_array(lv,a)
+ when y==Hash
+ do_hash(lv,a)
+ end
+ end
+ def cycle
+ the_idx=@the_idx
+ the_idx.each do |a|
+ do_case(-1,a)
+ end
+ end
+ self
+ end
+ def screen_print_unsorted
+ def do_string(lv,string)
+ s=' '*4
+ puts s*lv + string
+ end
+ def do_array(lv,array)
+ lv+=1
+ array.each do |b|
+ do_case(lv,b)
+ end
+ end
+ def do_hash_md(lv,hash)
+ string=hash[:title] + ' - ' + hash[:author]
+ do_string(lv,string)
+ end
+ def do_hash(lv,hash)
+ lv+=1
+ hash.each_key do |m|
+ if m == :md
+ do_case(lv,hash[m])
+ else
+ if m != :title and m != :author and m != :filename and m != :file and m != :rough_idx and m != :page
+ do_string(lv,m)
+ do_case(lv,hash[m])
+ elsif m == :title
+ do_hash_md(lv,hash)
+ else
+ end
+ end
+ end
+ end
+ def do_case(lv,a)
+ s=' '*4
+ y = a.class
+ case
+ when y==String
+ do_string(lv,a)
+ when y==Array
+ do_array(lv,a)
+ when y==Hash
+ do_hash(lv,a)
+ end
+ end
+ def cycle
+ the_idx=@the_idx
+ the_idx.each do |a|
+ do_case(-1,a)
+ end
+ end
+ self
+ end
+ end
+end
+__END__
+terms -|_ t{tl1} -|_ {fa}[fa]{filenames and other details}
+ | |_ {tl2} -|_ {fa}[fa]{filenames and other details}
+ | | |_{tl3} -|_ {fa}[fa]{filenames and other details}
+ | | | |_{tl4} - {fa}[fa]{filenames and other details}
+ | | | |
+ | | | |_{tl4a} - {fa}[fa]{filenames and other details}
+ | | | |
+ | | | |_{tl4b} - {fa}[fa]{filenames and other details}
+ | | | |
+ | | | |_ ...
+ | | |
+ | | |_{tl3a} - {fa}[fa]{filenames and other details}
+ | |
+ | |_{tl2a} - {fa}[fa]{filenames and other details}
+ |
+ |_ t{tl1a} -|_ {fa}[fa]{filenames and other details}
+ |_ ...