# coding: utf-8
=begin
* Name: SiSU
* Description: a framework for document structuring, publishing and search
#___#
* Author: Ralph Amissah
* Copyright: (C) 1997 - 2011, Ralph Amissah, All Rights Reserved.
* License: GPL 3 or later:
SiSU, a framework for document structuring, publishing and search
Copyright (C) Ralph Amissah
This program is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
If you have Internet connection, the latest version of the GPL should be
available at:
<http://www.gnu.org/licenses/gpl.html>
* SiSU uses:
* Standard SiSU markup syntax,
* Standard SiSU meta-markup syntax, and the
* Standard SiSU object citation numbering and system
* Homepages:
* Download:
* Ralph Amissah
** Description: alternative text representations of document objects (markup stripped or semi-reverted) and hash digests of them
=end
module SiSU_text_representation
class Alter
  # Converts text carrying SiSU internal markup (markers are looked up in the
  # Mx table) into alternative representations. Every public method rewrites
  # @s and returns it.
  #
  # x:: a String (used directly, @t_o stays nil), or a document object that
  #     responds to #obj; the object is kept in @t_o and a copy of its text
  #     in @s.
  #
  # NOTE(review): the strike-through closing marker is looked up as
  # Mx[:fa_strke_c] (sic) throughout; this looks like a misspelling of
  # :fa_strike_c. If Mx has no such key the closing marker interpolates as an
  # empty string -- confirm against the Mx definition.
  def initialize(x)
    @t_o,@s=(x.class==String ? [nil,x] : [x,x.obj.dup])
  end

  # Tidies spacing (dal output tuned): drops spaces before , . ; : ? (only
  # when the text holds no endnote opening marker), collapses runs of spaces
  # and closes up "<bold|italics close>' s " into "...'s ".
  def strip_clean_of_extra_spaces # dal output tuned
    @s=@s.dup
    unless @s =~/#{Mx[:en_a_o]}|#{Mx[:en_b_o]}/
      @s=@s.gsub(/[ ]+([,.;:?](?:$|\s))/,'\1')
    end
    @s=@s.gsub(/ [ ]+/,' ')
    @s=@s.gsub(/^ [ ]+/,'')
    @s=@s.gsub(/ [ ]+$/,'')
    # the possessive fix-up is deliberately applied twice, as before
    2.times do
      @s=@s.gsub(/((?:#{Mx[:fa_bold_c]}|#{Mx[:fa_italics_c]})')[ ]+(s )/,'\1\2')
    end
    @s
  end

  # Plain-text form: font attributes unwrapped, endnotes removed, glyph
  # codes turned into characters, break markers into newlines, whitespace
  # runs collapsed and the result stripped.
  # (text form used in sql db search, used for digest, define rules, make
  # same as in db clean)
  def strip_clean_of_markup
    @s=@s.dup #% same as db clean -->
    [
      [:fa_bold_o,:fa_bold_c],
      [:fa_italics_o,:fa_italics_c],
      [:fa_underscore_o,:fa_underscore_c],
      [:fa_cite_o,:fa_cite_c],
      [:fa_insert_o,:fa_insert_c],
      [:fa_strike_o,:fa_strke_c]
    ].each do |o_key,c_key|
      @s=@s.gsub(/#{Mx[o_key]}(.+?)#{Mx[c_key]}/,'\1')
    end
    # purely numeric superscripts become [n]; any other superscript is unwrapped
    @s=@s.gsub(/#{Mx[:fa_superscript_o]}(\d+)#{Mx[:fa_superscript_c]}/,'[\1]')
    [
      [:fa_superscript_o,:fa_superscript_c],
      [:fa_subscript_o,:fa_subscript_c],
      [:fa_hilite_o,:fa_hilite_c]
    ].each do |o_key,c_key|
      @s=@s.gsub(/#{Mx[o_key]}(.+?)#{Mx[c_key]}/,'\1')
    end
    @s=@s.gsub(/#{Mx[:gl_o]}#(?:126|152)#{Mx[:gl_c]}/i,'~')
    # endnotes removed
    @s=@s.gsub(/#{Mx[:en_a_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_a_c]}/,'')
    @s=@s.gsub(/#{Mx[:en_b_o]}([\d*+]+)\s+(?:.+?)#{Mx[:en_b_c]}/,'')
    @s=@s.gsub(/(?:#{Mx[:nbsp]})+/,' ')
    @s=@s.gsub(/(?:#{Mx[:br_nl]})+/,"\n")
    @s=@s.gsub(/(?:#{Mx[:br_paragraph]})+/,"\n")
    @s=@s.gsub(/(?:#{Mx[:br_line]})+/,"\n")
    @s=@s.gsub(/#{Mx[:gl_o]}(?:#lt|#060)#{Mx[:gl_c]}/,'<')
    @s=@s.gsub(/#{Mx[:gl_o]}(?:#gt|#062)#{Mx[:gl_c]}/,'>')
    @s=@s.gsub(/#{Mx[:gl_o]}#(?:038|amp)#{Mx[:gl_c]}/,'&')
    # remaining glyph codes, in the original order
    [
      ['#033','!'],
      ['#035','#'],
      ['#042','*'],
      ['#045','-'],
      ['#047','/'],
      ['#095','_'],
      ['#123','{'],
      ['#125','}'],
      ['#126','~'],
      ['#169','©']
    ].each do |code,char|
      @s=@s.gsub(/#{Mx[:gl_o]}#{code}#{Mx[:gl_c]}/,char)
    end
    2.times { @s=@s.gsub(/\s\s+/,' ') }
    @s=@s.strip
  end

  # Turns internal markers back into SiSU-style source markup (*{ }*, /{ }/,
  # ~{ }~ ...) and, depending on @t_o.is, adds the heading level, bullet or
  # indent prefix, or wraps block/group/code text. Only acts when built from
  # a document object; for a plain String @s is returned untouched.
  # (used for digest, define rules, make same as in db clean)
  def semi_revert_markup
    return @s unless @t_o
    [
      [:fa_bold_o,:fa_bold_c,'*{\1}*'],
      [:fa_italics_o,:fa_italics_c,'/{\1}/'],
      [:fa_underscore_o,:fa_underscore_c,'_{\1}_'],
      [:fa_cite_o,:fa_cite_c,'"{\1}"'],
      [:fa_insert_o,:fa_insert_c,'+{\1}+'],
      [:fa_strike_o,:fa_strke_c,'-{\1}-'],
      [:fa_superscript_o,:fa_superscript_c,'^{\1}^'],
      [:fa_subscript_o,:fa_subscript_c,',{\1},']
    ].each do |o_key,c_key,wrapped|
      @s=@s.gsub(/#{Mx[o_key]}(.+?)#{Mx[c_key]}/,wrapped)
    end
    @s=@s.gsub(/#{Mx[:gl_o]}#(?:126|152)#{Mx[:gl_c]}/i,'~')
    # endnote markers marked up
    @s=@s.gsub(/#{Mx[:en_a_o]}([\d*+]+\s+.+?)#{Mx[:en_a_c]}/,'~{\1}~')
    @s=@s.gsub(/#{Mx[:en_b_o]}([\d*+]+\s+.+?)#{Mx[:en_b_c]}/,'~[\1]~')
    kind=@t_o.is
    if kind=='heading' || kind=='para'
      @s=@s.gsub(/ [ ]+/,' ')
      @s=@s.gsub(/(?:#{Mx[:nbsp]})+/,' ')
      if kind=='heading'
        @s=@t_o.lv + '~ ' + @s
      elsif kind=='para'
        @s='_* ' + @s if @t_o.bullet_
        if @t_o.indent.to_i > 0
          @s="_#{@t_o.indent} " + @s
          # an indented bullet "_N _* " is merged into "_N* "
          @s=@s.gsub(/^(_[1-9])\s_\*\s/,'\1* ')
        end
      end
    end
    if kind=='block' || kind=='group' || kind=='code'
      @s=@s.gsub(/#{Mx[:nbsp]}/,' ')
      @s="#{kind}{\n\n#{@s}\n\n}#{kind}"
      @s=@s.gsub(/(?:#{Mx[:br_nl]}|\n)+/m,"\n\n")
    end
    #dealing with poem and verse calls for change in dal, where start and end verse of poem are marked as such
    @s=@s.strip
  end

  # Light html-oriented form of the text.
  # (test whether eventually can be used in db_import replacing
  # shared_html_lite, search for SiSU_Format_Shared)
  #
  # For a plain String (no @t_o) it only prints this file name and a line
  # number and returns @s unchanged.
  #
  # NOTE(review): #urls is not defined and @url_brace is not assigned inside
  # this class as visible here; unless they are supplied elsewhere the
  # non-code branch raises NoMethodError -- confirm.
  def html_lite
    unless @t_o
      p __FILE__ +':'+ __LINE__.to_s
      return @s
    end
    [
      [:fa_bold_o,:fa_bold_c,'\1'],
      [:fa_italics_o,:fa_italics_c,'\1'],
      [:fa_underscore_o,:fa_underscore_c,'\1'],
      [:fa_cite_o,:fa_cite_c,'"\1"'],
      [:fa_insert_o,:fa_insert_c,'+{\1}+'],
      [:fa_strike_o,:fa_strke_c,'-{\1}-'],
      [:fa_superscript_o,:fa_superscript_c,'\1'],
      [:fa_subscript_o,:fa_subscript_c,'\1']
    ].each do |o_key,c_key,repl|
      @s=@s.gsub(/#{Mx[o_key]}(.+?)#{Mx[c_key]}/,repl)
    end
    @s=@s.gsub(/#{Mx[:gl_o]}#(?:126|152)#{Mx[:gl_c]}/i,'~')
    if @t_o.is=='code'
      #code-block: angle brackets special characters
      # two passes on purpose: the second pass can match underscores that
      # directly followed one replaced by the first pass
      2.times { @s=@s.gsub(/(^|[^}])_/m,'\1>') }
    else
      if @s =~/#{Mx[:lnk_o]}.+?#{Mx[:lnk_c]}(?:#{Mx[:url_o]}\S+?#{Mx[:url_c]}|image)/
        wm=@s.scan(/#{Mx[:lnk_o]}.+?#{Mx[:lnk_c]}(?:#{Mx[:url_o]}\S+?#{Mx[:url_c]}|image)|\S+/)
        words=urls(wm)
        @s=@s.gsub(/.+/m,words)
      end
      # glyph codes become html entities
      @s=@s.gsub(/#{Mx[:gl_o]}(#[0-9]{3})#{Mx[:gl_c]}/u,'&\1;')
      @s=@s.gsub(/#{Mx[:gl_o]}#([a-z]{2,4})#{Mx[:gl_c]}/u,'&\1;')
      #http ftp matches escaped, no decoration
      @s=@s.gsub(/#{Mx[:url_o]}_(\S+?)#{Mx[:url_c]}/,'\1')
      #special case \{ e.g. \}http://url
      @s=@s.gsub(/(#{Mx[:lnk_c]})#{Mx[:url_o]}(\S+?)#{Mx[:url_c]}/,'\1\2\3')
      #http ftp matches with decoration
      @s=@s.gsub(/#{Mx[:url_o]}(\S+?)#{Mx[:url_c]}/,%{#{@url_brace.xml_open}\\1#{@url_brace.xml_close}})
    end
    # Placeholders: nothing is changed yet for bullets, indents or headings.
    # NOTE(review): the other methods test for 'para', this one for
    # 'paragraph' -- confirm which value @t_o.is really carries.
    if @t_o.is=='paragraph'
      @s=@s if @t_o.bullet_
      @s=@s if @t_o.indent > 0
    end
    @s=@s if @t_o.is=='heading'
    @s
  end
end
class Modified_text_plus_Hash_digest
  # Pairs alternative text representations of a document object (or of a
  # plain string) with hex digests of that text, its endnotes and its images.
  #
  # Use by chaining a selector with a helper, e.g.
  #   Modified_text_plus_Hash_digest.new(md,t_o).composite.dgst
  #
  # NOTE: every selector (#strip_clean_of_markup, #semi_revert_markup,
  # #composite) uses nested +def+. Ruby (re)defines such methods on this
  # class each time the selector runs, so #txt and #dgst behave according to
  # whichever selector was called most recently, on any instance.
  #
  # md:: metadata object; md.fns is handed to SiSU_Env::Info_env and md.cmd
  #      is matched against /q/ to silence the missing-image report
  # x::  a String (used directly, @t_o stays nil), or a document object
  #      responding to #obj (kept in @t_o, its text copied into @s)
  def initialize(md,x)
    @md=md
    if x.class==String
      @t_o,@s=nil,x
    else
      @t_o,@s=x,x.obj.dup
    end
    @env ||=SiSU_Env::Info_env.new(@md.fns)
    # sha256 when the environment says so, md5 otherwise
    @sha_=(@env.digest.type=='sha256')
    @sha_ ? (require 'digest/sha2') : (require 'digest/md5')
  end

  # Hex digest of txt, using SHA256 or MD5 as selected in #initialize.
  def digest(txt)
    (@sha_ ? Digest::SHA256 : Digest::MD5).hexdigest(txt)
  end

  # Selector: afterwards #txt gives @s with markup stripped and #dgst gives
  # {:txt=>text,:dgst_txt=>hex digest of text}. Returns self for chaining.
  def strip_clean_of_markup
    def txt
      SiSU_text_representation::Alter.new(@s).strip_clean_of_markup
    end
    def dgst
      cleaned=txt
      {:txt=>cleaned,:dgst_txt=>digest(cleaned)}
    end
    self
  end

  # Selector: afterwards #txt gives Alter#semi_revert_markup of @s and #dgst
  # gives {:txt=>text,:dgst_txt=>hex digest of text}. Returns self.
  #
  # NOTE(review): @s is a String here, and Alter#semi_revert_markup only
  # transforms text when Alter was built from a document object, so this
  # appears to hand the text back unchanged -- confirm whether @t_o was
  # meant to be passed.
  def semi_revert_markup
    def txt
      SiSU_text_representation::Alter.new(@s).semi_revert_markup
    end
    def dgst
      reverted=txt
      {:txt=>reverted,:dgst_txt=>digest(reverted)}
    end
    self
  end

  # Selector for the combined digest of a document object: defines the
  # helpers below and returns self.
  def composite
    # Markup-stripped text of txt (a String or a document object).
    def stripped_clean(txt)
      SiSU_text_representation::Alter.new(txt).strip_clean_of_markup
    end

    # Semi-reverted markup of txt (a String or a document object).
    def markup_reverted(txt)
      SiSU_text_representation::Alter.new(txt).semi_revert_markup
    end

    # One hash per image file name in imgs:
    #   {:img_dgst=>..., :img_name=>..., :img_type=>...}
    # The file is searched for in the local, remote and default image source
    # directories of @env.path, in that order. A missing file is reported
    # (unless md.cmd matches /q/), gets ' [image missing]' appended to its
    # name and a nil digest. :img_type is the png/jpg/gif extension, or nil
    # when the name has none (previously this raised on nil).
    # Returns [] for nil or empty imgs.
    def images(imgs)
      sys=SiSU_Env::System_call.new
      return [] unless imgs && imgs.length > 0
      imgs.map do |i|
        image_source=if FileTest.file?("#{@env.path.image_source_include_local}/#{i}")
          @env.path.image_source_include_local
        elsif FileTest.file?("#{@env.path.image_source_include_remote}/#{i}")
          @env.path.image_source_include_remote
        elsif FileTest.file?("#{@env.path.image_source_include}/#{i}")
          @env.path.image_source_include
        else
          SiSU_Screen::Ansi.new(@md.cmd,"ERROR - image:", %{"#{i}" missing}, "search locations: #{@env.path.image_source_include_local}, #{@env.path.image_source_include_remote} and #{@env.path.image_source_include}").error2 unless @md.cmd =~/q/
          nil
        end
        img_type=i[/\S+\.(png|jpg|gif)/,1]
        if image_source
          para_image=image_source + '/' + i
          image_dgst=(@sha_ ? sys.sha256(para_image) : sys.md5(para_image))
          # element 1 of the system call result is stored as the digest
          # NOTE(review): presumably the result is [name, digest] -- confirm
          {:img_dgst=>image_dgst[1],:img_name=>i,:img_type=>img_type}
        else
          {:img_dgst=>nil,:img_name=>i + ' [image missing]',:img_type=>img_type}
        end
      end
    end

    # One {:note_number=>...,:note_dgst=>...} hash per endnote text in en
    # (nested arrays are flattened). Returns [] for nil or empty en.
    #
    # NOTE(review): the stored value is the digest of the hex digest of the
    # cleaned note (hashed twice). Kept unchanged, since existing stored
    # digests may depend on it -- confirm this is intended.
    def endnotes(en)
      return [] unless en && en.length > 0
      en.flatten.map do |e|
        note_no=e.gsub(/^([\d*+]+)\s+.+/,'\1')
        {:note_number=>note_no,:note_dgst=>digest(digest(stripped_clean(e)))}
      end
    end

    # Digest summary of @t_o, or nil when @t_o.of is 'comment', 'structure'
    # or 'layout'. Always has :is, :ocn, :dgst_stripped_txt and
    # :dgst_markedup_txt; :endnotes and :images only when any were found
    # (images are looked for in 'para' and 'image' objects only).
    def dgst
      return nil if @t_o.of=='comment' || @t_o.of=='structure' || @t_o.of=='layout'
      txt_stripped_dgst=digest(stripped_clean(@t_o))
      txt_markup_reverted_dgst=digest(markup_reverted(@t_o))
      rgx_notes=/(?:#{Mx[:en_a_o]}|#{Mx[:en_b_o]})([\d*+]+\s+.+?)(?:#{Mx[:en_a_c]}|#{Mx[:en_b_c]})/
      endnotes_dgst=endnotes(@t_o.obj.scan(rgx_notes))
      rgx_image=/#{Mx[:lnk_o]}(\S+\.(?:png|jpg|gif))\s.+?#{Mx[:lnk_c]}(?:#{Mx[:url_o]}\S+?#{Mx[:url_c]}|image)/
      line_image=nil
      if (@t_o.is=='para' || @t_o.is=='image') && @t_o.obj =~rgx_image
        line_image=images(@t_o.obj.scan(rgx_image).flatten)
      end
      summary={:is=>@t_o.is,:ocn=>@t_o.ocn,:dgst_stripped_txt=>txt_stripped_dgst,:dgst_markedup_txt=>txt_markup_reverted_dgst}
      summary[:endnotes]=endnotes_dgst if endnotes_dgst && endnotes_dgst.length > 0
      summary[:images]=line_image if line_image && line_image.length > 0
      summary
    end
    self
  end
end
end
__END__