c&d: small fixes
[software/sisu] / lib / sisu / develop / db_sqltxt.rb
1 # encoding: utf-8
2 =begin
3
4 * Name: SiSU
5
6 ** Description: documents, structuring, processing, publishing, search
7 *** system environment, resource control and configuration details
8
9 ** Author: Ralph Amissah
10 <ralph@amissah.com>
11 <ralph.amissah@gmail.com>
12
13 ** Copyright: (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
14 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Ralph Amissah,
15 All Rights Reserved.
16
17 ** License: GPL 3 or later:
18
19 SiSU, a framework for document structuring, publishing and search
20
21 Copyright (C) Ralph Amissah
22
23 This program is free software: you can redistribute it and/or modify it
24 under the terms of the GNU General Public License as published by the Free
25 Software Foundation, either version 3 of the License, or (at your option)
26 any later version.
27
28 This program is distributed in the hope that it will be useful, but WITHOUT
29 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
30 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
31 more details.
32
33 You should have received a copy of the GNU General Public License along with
34 this program. If not, see <http://www.gnu.org/licenses/>.
35
36 If you have Internet connection, the latest version of the GPL should be
37 available at these locations:
38 <http://www.fsf.org/licensing/licenses/gpl.html>
39 <http://www.gnu.org/licenses/gpl.html>
40
41 <http://www.sisudoc.org/sisu/en/manifest/gpl.fsf.html>
42
43 ** SiSU uses:
44 * Standard SiSU markup syntax,
45 * Standard SiSU meta-markup syntax, and the
46 * Standard SiSU object citation numbering and system
47
48 ** Homepages:
49 <http://www.jus.uio.no/sisu>
50 <http://www.sisudoc.org>
51
52 ** Git
53 <http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=summary>
54 <http://git.sisudoc.org/gitweb/?p=code/sisu.git;a=blob;f=lib/sisu/develop/db_sqltxt.rb;hb=HEAD>
55
56 =end
# Text preparation helpers used when loading document text into SQL tables.
module SiSU_DbText
  # Cleans and escapes document text before it is stored or indexed.
  #
  # The methods rely on names that are not defined in this file:
  # * +Mx+ - table of internal markup markers, interpolated into the regexps;
  # * +@md+ and +SiSU_Screen::Ansi+ - only used for the ":A~" heading line in
  #   #clean_searchable_text_from_document_source (+@md+ is never assigned in
  #   this class; presumably a subclass or caller provides it - confirm).
  class Prepare
    # Escapes +str+ for embedding in a quoted SQL string and flattens the
    # internal line-break, tag and link markers into plain text / <br>.
    #
    # Returns a new string; +str+ itself is not modified.
    def special_character_escape(str)
      str.gsub(/'/m,"''"). # single quote -> two single quotes
        gsub(/(\\)/m,'\1\1'). # double every backslash (ok for sqlite, with warnings)
        gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/m,"<br>\n").
        gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/m,''). #check
        gsub(/#{Mx[:lnk_o]}\s*(\S+?\.(?:png|jpg))(?:\s+\d+x\d+)?(.+?)#{Mx[:lnk_c]}\S+/m,'[image: \1] \2').
        gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+?([.,!?]?(?:\s|$))/m,'\1\2').
        gsub(/#{Mx[:lnk_o]}\s*(.+?)\s*#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/m,'\1')
    end

    # Builds searchable text from processed document objects.
    #
    # +arr+ is a String or an Array of Strings. Font-attribute markers and
    # "<br>" are removed, endnote texts are pulled out of the body and appended
    # after it, and everything is joined with newlines and escaped with
    # #special_character_escape.
    def clean_searchable_text_from_document_objects(arr)
      objects=arr.is_a?(String) ? [arr] : arr
      endnotes=[]
      bodies=objects.map do |s|
        s=s.gsub(/#{Mx[:fa_o]}[a-z]{1,4}#{Mx[:fa_o_c]}/m,'').
          gsub(/#{Mx[:fa_c_o]}[a-z]{1,4}#{Mx[:fa_c]}/m,'').
          gsub(/<br>/m,' ')
        # collect endnote contents before they are cut out of the body
        endnotes << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m)
        s.gsub(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/m,'').
          gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,'').
          gsub(/ \s+/m,' ')
      end
      special_character_escape((bodies + endnotes).flatten.join("\n"))
    end

    # Builds the displayable body text from processed document objects.
    #
    # +arr+ is a String or an Array of Strings. Each numbered endnote in the
    # body is replaced by "<sup>N</sup>", the endnote texts (with the same
    # superscript prefix) are appended after the body, and the pieces are
    # joined with "\n<br>" and escaped with #special_character_escape.
    def clean_document_objects_body(arr)
      objects=arr.is_a?(String) ? [arr] : arr
      endnotes=[]
      bodies=objects.map do |s|
        endnotes << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m)
        s.gsub(/#{Mx[:en_a_o]}\s*(\d+).+?#{Mx[:en_a_c]}/m,'<sup>\1</sup>').
          gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,'').
          gsub(/ \s+/m,' ')
      end
      notes=endnotes.flatten.map { |e| e.sub(/^(\d+)\s*/,'<sup>\1</sup> ') }
      special_character_escape((bodies + notes).flatten.join("\n<br>"))
    end

    # Builds searchable text from markup source.
    #
    # +arr+ is a String (split on runs of newlines) or an Array of lines.
    # For every line: inline emphasis braces, block open/close tags, header
    # ("@name: ...") lines, indent/heading markers, "%" comment lines, "<br>"
    # and inline endnotes are removed. On an ":A~" line "@author" / "@title"
    # are replaced from +@md+, with a warning when the value is missing.
    #
    # The cleaned lines are collected with +map+: the substitutions are
    # non-destructive, so iterating with +each+ would discard them and return
    # the untouched input.
    #
    # Returns the cleaned lines joined with newlines and escaped with
    # #special_character_escape.
    def clean_searchable_text_from_document_source(arr)
      lines=arr.is_a?(String) ? arr.split(/\n+/m) : arr
      cleaned=lines.map do |s|
        s=s.gsub(/([*\/_-])\{(.+?)\}\1/m,'\2').
          gsub(/^(?:block|group|poem|code)\{/m,'').
          gsub(/^\}(?:block|group|poem|code)/m,'').
          gsub(/\A(?:@\S+:\s+.+)\Z/m,'')
        if s =~/^:A~/
          if defined?(@md.creator) \
          && defined?(@md.creator.author) \
          && !@md.creator.author.empty?
            s=s.gsub(/@author/,@md.creator.author)
          else
            SiSU_Screen::Ansi.new(
              'v',
              'WARNING Document Author information missing; provide @creator: :author:',
              @md.fnb
            ).warn unless @md.opt.act[:quiet][:set]==:on
          end
          if defined?(@md.title) \
          && defined?(@md.title.full) \
          && !@md.title.full.empty?
            s=s.gsub(/@title/,@md.title.full)
          else
            SiSU_Screen::Ansi.new(
              'v',
              'WARNING Document Title missing; provide @title:',
              @md.fnb
            ).warn unless @md.opt.act[:quiet][:set]==:on
          end
        end
        s.gsub(/^(?:_[1-9]\*?|_\*)\s+/m,'').
          gsub(/^(?:[1-9]\~(\S+)?)\s+/m,'').
          gsub(/^(?::?[A-C]\~(\S+)?)\s+/m,'').
          gsub(/^%{1,3} .+/m,''). #removed even if contained in code block
          gsub(/<br>/m,' ').
          gsub(/~\{.+?\}~/m,''). # inline endnotes are dropped, not collected
          gsub(/ \s+/m,' ')
      end
      special_character_escape(cleaned.flatten.join("\n"))
    end

    # Strips internal markup from +str+: superscript markers become "[N]",
    # non-breaking spaces and table cell markers become spaces, angle-bracket
    # tags are removed and image links are replaced by "[image]" so that image
    # file names are not found in search. Whitespace runs are collapsed and the
    # result is stripped.
    def strip_markup(str) #define rules, make same as in dal clean
      str.gsub(/#{Mx[:fa_superscript_o]}(\d+)#{Mx[:fa_superscript_c]}/,'[\1]').
        gsub(/(?:&nbsp\\;|#{Mx[:nbsp]})+/,' ').
        gsub(/#{Mx[:tc_o]}#{Mx[:tc_p]}#{Mx[:tc_p]}\d+(.+)#{Mx[:tc_c]}/u,'\1'). #tables
        gsub(/#{Mx[:tc_p]}#{Mx[:tc_p]}\d+#{Mx[:tc_p]}/u,' '). #tables
        gsub(/#{Mx[:tc_p]}/u,' '). #tables tidy later
        gsub(/<.+?>/,'').
        gsub(/#{Mx[:lnk_o]}.+?\.(?:png|jpg|gif).+?#{Mx[:lnk_c]}(?:file|ftp):\/\/\S+ /,' [image] '). # scheme separator is "://", as in special_character_escape
        gsub(/#{Mx[:lnk_o]}.+?\.(?:png|jpg|gif).+?#{Mx[:lnk_c]}#{Mx[:url_o]}\S+?#{Mx[:url_c]}/,' [image]'). # else image names found in search
        gsub(/\s\s+/,' ').
        strip
    end

    # Returns the distinct "words" of +str+ (runs of two or more letters,
    # digits, backslash, slash, underscore or hyphen), sorted and joined by a
    # single space.
    def unique_words(str)
      str.scan(/[a-zA-Z0-9\\\/_-]{2,}/).uniq.sort.join(' ')
    end
  end
end
174 __END__