From f6d28b62f0e02b8a88a1832589e203c7a613f45b Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Fri, 25 Nov 2022 22:06:40 -0500 Subject: regex review, match speed & compile time, ctregex - improve match time - add interim fontface identifier marker - improve compile time - remove unused regexs - separate out some specialized output matches --- org/default_regex.org | 198 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 140 insertions(+), 58 deletions(-) (limited to 'org/default_regex.org') diff --git a/org/default_regex.org b/org/default_regex.org index 89d6ea3..976baa0 100644 --- a/org/default_regex.org +++ b/org/default_regex.org @@ -67,7 +67,6 @@ static template spineRgxIn() { <> <> <> - <> <> <> <> @@ -86,10 +85,6 @@ static template spineRgxIn() { /+ misc +/ static flag_action = ctRegex!(`^(--[a-z][a-z0-9-]+)$`); static within_quotes = ctRegex!(`"(.+?)"`, "m"); -static yaml_tag_is_str = ctRegex!(`:str$`); -static yaml_tag_is_int = ctRegex!(`:int$`); -static yaml_tag_is_map = ctRegex!(`:map$`); -static yaml_tag_is_seq = ctRegex!(`:seq$`); static make_heading_delimiter = ctRegex!(`[;][ ]*`); static arr_delimiter = ctRegex!(`[ ]*[;][ ]*`); static name_delimiter = ctRegex!(`^([^,]+)[ ]*,[ ]+(.+?)$`); @@ -476,8 +471,6 @@ static template spineRgxOut() { <> <> <> - <> - <> <> } } @@ -492,35 +485,22 @@ static make_breakpage = ctRegex!(`new=(?P.+ static make_breakcolumn = ctRegex!(`break=(?P.+?)(?:;|$)`,); #+END_SRC -** special characters -*** xhtml special characters +* 2. ctRegex defaults shared by meta & output (generic) -#+NAME: sp_ch_xhtml_rgx +** meta + +#+NAME: prgmkup_rgx_meta #+BEGIN_SRC d -static xhtml_ampersand = ctRegex!(`[&]`, "m"); // & -static xhtml_quotation = ctRegex!(`["]`, "m"); // " -static xhtml_less_than = ctRegex!(`[<]`, "m"); // < -static xhtml_greater_than = ctRegex!(`[>]`, "m"); // > -static xhtml_line_break = ctRegex!(` [\\]{2}`, "m"); //
+static space = ctRegex!(`[ ]`, "mg"); +static spaces_keep = ctRegex!(`(?P^[ ]+|[ ]{2,})`, "mg"); // code, verse, block #+END_SRC -*** latex special characters +** spine & source_in -#+NAME: sp_ch_latex_rgx +#+NAME: prgmkup_rgx_in #+BEGIN_SRC d -static latex_special_char = ctRegex!(`([%${}_#&\\])`); -static latex_special_char_for_escape = ctRegex!(`([%${}_#\\])`); -static latex_special_char_for_escape_and_braces = ctRegex!(`([&])`); -static latex_special_char_for_escape_url = ctRegex!(`([%])`); -static latex_special_char_escaped = ctRegex!(`\\([%${}_#\\])`); -static latex_special_char_escaped_braced = ctRegex!(`[{]\\([&])[}]`); -static latex_identify_inline_link = ctRegex!(`┥.+?┝┤\S+?├`, "mg"); -static latex_identify_inline_fontface = ctRegex!(`\\([_#$]┨.+?┣)\\([_#$])`, "mg"); -static latex_clean_internal_link = ctRegex!(`^(?:#|¤\S+?#)`, "m"); -static latex_clean_bookindex_linebreak = ctRegex!(`\s*\\\\\\\\\s*`, "m"); #+END_SRC -* 2. ctRegex defaults shared by meta & output (generic) ** misc generic #+NAME: prgmkup_rgx_spaces @@ -534,24 +514,6 @@ static nbsp_chars = ctRegex!(`[░]+`, "mg"); static middle_dot = ctRegex!(`·`, "mg"); #+END_SRC -** filename (& path) (including insert file) :insert:file:path:filename: - -#+NAME: prgmkup_rgx_filename_and_path -#+BEGIN_SRC d -static src_pth_sst_or_ssm = ctRegex!(`^(?P[/]?(?:[a-zA-Z0-9._-]+/)*)(?P[a-zA-Z0-9._-]+[.](?Pss[tm]))$`); -static src_pth_pod_sst_or_ssm = ctRegex!(`^(?P[/]?(?:[a-zA-Z0-9._-]+/)*)media/text/[a-z]{2}/(?P[a-zA-Z0-9][a-zA-Z0-9._-]*?[.]ss[tm])$`); -static src_pth_contents = ctRegex!(`^(?P[/]?(?:[a-zA-Z0-9._-]+/)*)(?P[a-zA-Z0-9][a-zA-Z0-9._-]*)/pod[.]manifest$`); -static src_pth_zip = ctRegex!(`^(?P[/]?(?:[a-zA-Z0-9._-]+/)*)(?P[a-zA-Z0-9._-]+[.]zip)$`); -static src_pth_types = ctRegex!(`^(?P[/]?[a-zA-Z0-9._-]+/)*(?P(?P[a-zA-Z0-9._-]+[.]ss[tm])|(?P[a-zA-Z0-9._-]+/pod[.]manifest)|(?P[a-zA-Z0-9._-]+[.]zip))$`); -static src_fn = - ctRegex!(`^([/]?(?:[a-zA-Z0-9._-]+/)*)(?P(?P[a-zA-Z0-9._-]+)[.](?Pss[tm]))$`); -static src_fn_master = ctRegex!(`^(?P/?(?:[a-zA-Z0-9._-]+/)*)(?P[a-zA-Z0-9._-]+[.]ssm)$`); -static src_fn_find_inserts = ctRegex!(`^(?P/?(?:[a-zA-Z0-9._-]+/)*)(?P[a-zA-Z0-9._-]+[.]ss[im])$`); -static insert_src_fn_ssi_or_sst = ctRegex!(`^<<\s*(?P[a-zA-Z0-9._-]+/)*(?P[a-zA-Z0-9._-]+[.]ss[ti])$`); -static src_base_parent_dir_name = ctRegex!(`[/](?P(?:[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure -static src_formalised_file_path_parts = ctRegex!(`(?P(?:[/a-zA-Z0-9._-]+?)(?P[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure -#+END_SRC - ** inline markup *** inline breaks @@ -666,21 +628,21 @@ static quotation_mark_sql_insert_delimiter = ctRegex!("[']", "mg"); #+NAME: prgmkup_rgx_inline_font_face #+BEGIN_SRC d /+ inline markup font face mod +/ -static inline_emphasis = ctRegex!(`[*]┨(?P.+?)┣[*]`, "mg"); -static inline_bold = ctRegex!(`[!]┨(?P.+?)┣[!]`, "mg"); -static inline_underscore = ctRegex!(`[_]┨(?P.+?)┣[_]`, "mg"); -static inline_italics = ctRegex!(`[/]┨(?P.+?)┣[/]`, "mg"); -static inline_superscript = ctRegex!(`\^┨(?P.+?)┣\^`, "mg"); -static inline_subscript = ctRegex!(`[,]┨(?P.+?)┣[,]`, "mg"); -static inline_strike = ctRegex!(`[-]┨(?P.+?)┣[-]`, "mg"); -static inline_insert = ctRegex!(`[+]┨(?P.+?)┣[+]`, "mg"); -static inline_mono = ctRegex!(`[■]┨(?P.+?)┣[■]`, "mg"); -static inline_cite = ctRegex!(`[‖]┨(?P.+?)┣[‖]`, "mg"); +static inline_emphasis = ctRegex!(`⑆[*]┨(?P.+?)┣[*]`, "mg"); +static inline_bold = ctRegex!(`⑆[!]┨(?P.+?)┣[!]`, "mg"); +static inline_underscore = ctRegex!(`⑆[_]┨(?P.+?)┣[_]`, "mg"); +static inline_italics = ctRegex!(`⑆[/]┨(?P.+?)┣[/]`, "mg"); +static inline_superscript = ctRegex!(`⑆\^┨(?P.+?)┣\^`, "mg"); +static inline_subscript = ctRegex!(`⑆[,]┨(?P.+?)┣[,]`, "mg"); +static inline_strike = ctRegex!(`⑆[-]┨(?P.+?)┣[-]`, "mg"); +static inline_insert = ctRegex!(`⑆[+]┨(?P.+?)┣[+]`, "mg"); +static inline_mono = ctRegex!(`⑆[■]┨(?P.+?)┣[■]`, "mg"); +static inline_cite = ctRegex!(`⑆[‖]┨(?P.+?)┣[‖]`, "mg"); #+END_SRC #+BEGIN_SRC d -// static inline_superscript = ctRegex!(`[\^]┨(?P.+?)┣[\^]`, "mg"); -// static inline_fontface_clean = ctRegex!(`[*!_/^,+■‖-]┨|┣[*!_/^,+■‖-]`, "mg"); +// static inline_superscript = ctRegex!(`⑆[\^]┨(?P.+?)┣[\^]`, "mg"); +// static inline_fontface_clean = ctRegex!(`⑆[*!_/^,+■‖-]┨|┣[*!_/^,+■‖-]`, "mg"); #+END_SRC *** table related @@ -692,6 +654,126 @@ static table_delimiter_col = ctRegex!("[ ]*[┊][ ]*", "mg" static table_delimiter_row = ctRegex!("[ ]*\n", "mg"); #+END_SRC +** files filename (& path) (including insert file) :insert:file:path:filename: + +#+HEADER: :tangle "../src/doc_reform/meta/rgx_files.d" +#+HEADER: :noweb yes +#+BEGIN_SRC d +<> +/++ + regex: regular expressions used in sisu document parser ++/ +module doc_reform.meta.rgx_files; +static template spineRgxFiles() { + static struct RgxFiles { + <> + <> + } +} +#+END_SRC + +#+NAME: prgmkup_rgx_filename_and_path +#+BEGIN_SRC d +static src_pth_sst_or_ssm = ctRegex!(`^(?P[/]?(?:[a-zA-Z0-9._-]+/)*)(?P[a-zA-Z0-9._-]+[.](?Pss[tm]))$`); +static src_pth_pod_sst_or_ssm = ctRegex!(`^(?P[/]?(?:[a-zA-Z0-9._-]+/)*)media/text/[a-z]{2}/(?P[a-zA-Z0-9][a-zA-Z0-9._-]*?[.]ss[tm])$`); +static src_pth_contents = ctRegex!(`^(?P[/]?(?:[a-zA-Z0-9._-]+/)*)(?P[a-zA-Z0-9][a-zA-Z0-9._-]*)/pod[.]manifest$`); +static src_pth_zip = ctRegex!(`^(?P[/]?(?:[a-zA-Z0-9._-]+/)*)(?P[a-zA-Z0-9._-]+[.]zip)$`); +static src_pth_types = ctRegex!(`^(?P[/]?[a-zA-Z0-9._-]+/)*(?P(?P[a-zA-Z0-9._-]+[.]ss[tm])|(?P[a-zA-Z0-9._-]+/pod[.]manifest)|(?P[a-zA-Z0-9._-]+[.]zip))$`); +static src_fn = ctRegex!(`^([/]?(?:[a-zA-Z0-9._-]+/)*)(?P(?P[a-zA-Z0-9._-]+)[.](?Pss[tm]))$`); +static src_fn_master = ctRegex!(`^(?P/?(?:[a-zA-Z0-9._-]+/)*)(?P[a-zA-Z0-9._-]+[.]ssm)$`); +static src_fn_find_inserts = ctRegex!(`^(?P/?(?:[a-zA-Z0-9._-]+/)*)(?P[a-zA-Z0-9._-]+[.]ss[im])$`); +static insert_src_fn_ssi_or_sst = ctRegex!(`^<<\s*(?P[a-zA-Z0-9._-]+/)*(?P[a-zA-Z0-9._-]+[.]ss[ti])$`); +static src_base_parent_dir_name = ctRegex!(`[/](?P(?:[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure +static src_formalised_file_path_parts = ctRegex!(`(?P(?:[/a-zA-Z0-9._-]+?)(?P[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure +#+END_SRC + +** _module template yaml tags + +#+HEADER: :tangle "../src/doc_reform/meta/rgx_yaml_tags.d" +#+HEADER: :noweb yes +#+BEGIN_SRC d +<> +/++ + regex: regular expressions used in sisu document parser ++/ +module doc_reform.meta.rgx_yaml; +static template spineRgxYamlTags() { + static struct RgxYaml { + <> + } +} +#+END_SRC + +#+NAME: meta_rgx_yaml +#+BEGIN_SRC d +static yaml_tag_is_str = ctRegex!(`:str$`); +static yaml_tag_is_int = ctRegex!(`:int$`); +static yaml_tag_is_map = ctRegex!(`:map$`); +static yaml_tag_is_seq = ctRegex!(`:seq$`); +#+END_SRC + +** special characters +*** xhtml special characters template + +#+HEADER: :tangle "../src/doc_reform/io_out/rgx_xhtml.d" +#+HEADER: :noweb yes +#+BEGIN_SRC d +<> +/++ + regex: regular expressions used in sisu document parser ++/ +module doc_reform.io_out.rgx_xhtml; +static template spineRgxXHTML() { + static struct RgxXHTML { + <> + } +} +#+END_SRC + +*** xhtml special characters + +#+NAME: sp_ch_xhtml_rgx +#+BEGIN_SRC d +static ampersand = ctRegex!(`[&]`, "m"); // & +static quotation = ctRegex!(`["]`, "m"); // " +static less_than = ctRegex!(`[<]`, "m"); // < +static greater_than = ctRegex!(`[>]`, "m"); // > +static line_break = ctRegex!(` [\\]{2}`, "m"); //
+#+END_SRC + +*** LaTeX special characters template + +#+HEADER: :tangle "../src/doc_reform/io_out/rgx_latex.d" +#+HEADER: :noweb yes +#+BEGIN_SRC d +<> +/++ + regex: regular expressions used in sisu document parser ++/ +module doc_reform.io_out.rgx_latex; +static template spineRgxLSC() { + static struct RgxLSC { + <> + } +} +#+END_SRC + +*** latex special characters + +#+NAME: sp_ch_latex_rgx +#+BEGIN_SRC d +static latex_special_char = ctRegex!(`([%${}_#&\\])`); +static latex_special_char_for_escape = ctRegex!(`([%${}_#\\])`); +static latex_special_char_for_escape_and_braces = ctRegex!(`([&])`); +static latex_special_char_for_escape_url = ctRegex!(`([%])`); +static latex_special_char_escaped = ctRegex!(`\\([%${}_#\\])`); +static latex_special_char_escaped_braced = ctRegex!(`[{]\\([&])[}]`); +static latex_identify_inline_link = ctRegex!(`┥.+?┝┤\S+?├`, "mg"); +static latex_identify_inline_fontface = ctRegex!(`\\([_#$]┨.+?┣)\\([_#$])`, "mg"); +static latex_clean_internal_link = ctRegex!(`^(?:#|¤\S+?#)`, "m"); +static latex_clean_bookindex_linebreak = ctRegex!(`\s*\\\\\\\\\s*`, "m"); +#+END_SRC + * document header including copyright & license #+NAME: doc_header_including_copyright_and_license -- cgit v1.2.3