diff options
| author | Ralph Amissah <ralph.amissah@gmail.com> | 2022-11-25 22:06:40 -0500 | 
|---|---|---|
| committer | Ralph Amissah <ralph.amissah@gmail.com> | 2022-12-23 18:17:41 -0500 | 
| commit | f6d28b62f0e02b8a88a1832589e203c7a613f45b (patch) | |
| tree | b5d6462e45bae998190194784e02b143a83f79a3 /org/default_regex.org | |
| parent | gitignore & things nix (diff) | |
regex review, match speed & compile time, ctregex
- improve match time
  - add interim fontface identifier marker
- improve compile time
  - remove unused regexes
  - separate out some specialized output matches
Diffstat (limited to 'org/default_regex.org')
| -rw-r--r-- | org/default_regex.org | 198 | 
1 files changed, 140 insertions, 58 deletions
diff --git a/org/default_regex.org b/org/default_regex.org index 89d6ea3..976baa0 100644 --- a/org/default_regex.org +++ b/org/default_regex.org @@ -67,7 +67,6 @@ static template spineRgxIn() {      <<meta_rgx_bibliography>>      <<meta_rgx_book_index_split>>      <<meta_rgx_topic_register_split>> -    <<meta_rgx_language_codes>>      <<prgmkup_rgx_spaces>>      <<prgmkup_rgx_filename_and_path>>      <<prgmkup_rgx_inline_breaks>> @@ -86,10 +85,6 @@ static template spineRgxIn() {  /+ misc +/  static flag_action                                    = ctRegex!(`^(--[a-z][a-z0-9-]+)$`);  static within_quotes                                  = ctRegex!(`"(.+?)"`, "m"); -static yaml_tag_is_str                                = ctRegex!(`:str$`); -static yaml_tag_is_int                                = ctRegex!(`:int$`); -static yaml_tag_is_map                                = ctRegex!(`:map$`); -static yaml_tag_is_seq                                = ctRegex!(`:seq$`);  static make_heading_delimiter                         = ctRegex!(`[;][ ]*`);  static arr_delimiter                                  = ctRegex!(`[ ]*[;][ ]*`);  static name_delimiter                                 = ctRegex!(`^([^,]+)[ ]*,[ ]+(.+?)$`); @@ -476,8 +471,6 @@ static template spineRgxOut() {      <<prgmkup_rgx_inline_links>>      <<prgmkup_rgx_inline_font_face>>      <<prgmkup_rgx_table>> -    <<sp_ch_xhtml_rgx>> -    <<sp_ch_latex_rgx>>      <<grouped_text_rgx_paragraph_marks>>    }  } @@ -492,35 +485,22 @@ static make_breakpage                           = ctRegex!(`new=(?P<breakpage>.+  static make_breakcolumn                         = ctRegex!(`break=(?P<breakcolumn>.+?)(?:;|$)`,);  #+END_SRC -** special characters -*** xhtml special characters +* 2. 
ctRegex defaults shared by meta & output (generic) -#+NAME: sp_ch_xhtml_rgx +** meta + +#+NAME: prgmkup_rgx_meta  #+BEGIN_SRC d -static xhtml_ampersand                          = ctRegex!(`[&]`, "m");      // & -static xhtml_quotation                          = ctRegex!(`["]`, "m");      // " -static xhtml_less_than                          = ctRegex!(`[<]`, "m");      // < -static xhtml_greater_than                       = ctRegex!(`[>]`, "m");      // > -static xhtml_line_break                         = ctRegex!(` [\\]{2}`, "m"); // <br /> +static space                                    = ctRegex!(`[ ]`, "mg"); +static spaces_keep                              = ctRegex!(`(?P<keep_spaces>^[ ]+|[ ]{2,})`, "mg"); // code, verse, block  #+END_SRC -*** latex special characters +** spine & source_in -#+NAME: sp_ch_latex_rgx +#+NAME: prgmkup_rgx_in  #+BEGIN_SRC d -static latex_special_char                       = ctRegex!(`([%${}_#&\\])`); -static latex_special_char_for_escape            = ctRegex!(`([%${}_#\\])`); -static latex_special_char_for_escape_and_braces = ctRegex!(`([&])`); -static latex_special_char_for_escape_url        = ctRegex!(`([%])`); -static latex_special_char_escaped               = ctRegex!(`\\([%${}_#\\])`); -static latex_special_char_escaped_braced        = ctRegex!(`[{]\\([&])[}]`); -static latex_identify_inline_link               = ctRegex!(`┥.+?┝┤\S+?├`, "mg"); -static latex_identify_inline_fontface           = ctRegex!(`\\([_#$]┨.+?┣)\\([_#$])`, "mg"); -static latex_clean_internal_link                = ctRegex!(`^(?:#|¤\S+?#)`, "m"); -static latex_clean_bookindex_linebreak          = ctRegex!(`\s*\\\\\\\\\s*`, "m");  #+END_SRC -* 2. 
ctRegex defaults shared by meta & output (generic)  ** misc generic  #+NAME: prgmkup_rgx_spaces @@ -534,24 +514,6 @@ static nbsp_chars                               = ctRegex!(`[░]+`, "mg");  static middle_dot                               = ctRegex!(`·`, "mg");  #+END_SRC -** filename (& path) (including insert file) :insert:file:path:filename: - -#+NAME: prgmkup_rgx_filename_and_path -#+BEGIN_SRC d -static src_pth_sst_or_ssm                       = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.](?P<extension>ss[tm]))$`); -static src_pth_pod_sst_or_ssm                   = ctRegex!(`^(?P<podpath>[/]?(?:[a-zA-Z0-9._-]+/)*)media/text/[a-z]{2}/(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*?[.]ss[tm])$`); -static src_pth_contents                         = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*)/pod[.]manifest$`); -static src_pth_zip                              = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]zip)$`); -static src_pth_types                            = ctRegex!(`^(?P<path>[/]?[a-zA-Z0-9._-]+/)*(?P<gotfile>(?P<filename>[a-zA-Z0-9._-]+[.]ss[tm])|(?P<filelist>[a-zA-Z0-9._-]+/pod[.]manifest)|(?P<filezip>[a-zA-Z0-9._-]+[.]zip))$`); -static src_fn                                   = -  ctRegex!(`^([/]?(?:[a-zA-Z0-9._-]+/)*)(?P<fn_src>(?P<fn_base>[a-zA-Z0-9._-]+)[.](?P<fn_src_suffix>ss[tm]))$`); -static src_fn_master                            = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ssm)$`); -static src_fn_find_inserts                      = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ss[im])$`); -static insert_src_fn_ssi_or_sst                 = ctRegex!(`^<<\s*(?P<path>[a-zA-Z0-9._-]+/)*(?P<filename>[a-zA-Z0-9._-]+[.]ss[ti])$`); -static src_base_parent_dir_name                 = ctRegex!(`[/](?P<dir>(?:[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure -static 
src_formalised_file_path_parts           = ctRegex!(`(?P<pth>(?:[/a-zA-Z0-9._-]+?)(?P<dir>[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure -#+END_SRC -  ** inline markup  *** inline breaks @@ -666,21 +628,21 @@ static quotation_mark_sql_insert_delimiter      = ctRegex!("[']", "mg");  #+NAME: prgmkup_rgx_inline_font_face  #+BEGIN_SRC d  /+ inline markup font face mod +/ -static inline_emphasis                          = ctRegex!(`[*]┨(?P<text>.+?)┣[*]`, "mg"); -static inline_bold                              = ctRegex!(`[!]┨(?P<text>.+?)┣[!]`, "mg"); -static inline_underscore                        = ctRegex!(`[_]┨(?P<text>.+?)┣[_]`, "mg"); -static inline_italics                           = ctRegex!(`[/]┨(?P<text>.+?)┣[/]`, "mg"); -static inline_superscript                       = ctRegex!(`\^┨(?P<text>.+?)┣\^`, "mg"); -static inline_subscript                         = ctRegex!(`[,]┨(?P<text>.+?)┣[,]`, "mg"); -static inline_strike                            = ctRegex!(`[-]┨(?P<text>.+?)┣[-]`, "mg"); -static inline_insert                            = ctRegex!(`[+]┨(?P<text>.+?)┣[+]`, "mg"); -static inline_mono                              = ctRegex!(`[■]┨(?P<text>.+?)┣[■]`, "mg"); -static inline_cite                              = ctRegex!(`[‖]┨(?P<text>.+?)┣[‖]`, "mg"); +static inline_emphasis                          = ctRegex!(`⑆[*]┨(?P<text>.+?)┣[*]`, "mg"); +static inline_bold                              = ctRegex!(`⑆[!]┨(?P<text>.+?)┣[!]`, "mg"); +static inline_underscore                        = ctRegex!(`⑆[_]┨(?P<text>.+?)┣[_]`, "mg"); +static inline_italics                           = ctRegex!(`⑆[/]┨(?P<text>.+?)┣[/]`, "mg"); +static inline_superscript                       = ctRegex!(`⑆\^┨(?P<text>.+?)┣\^`, "mg"); +static inline_subscript                         = ctRegex!(`⑆[,]┨(?P<text>.+?)┣[,]`, "mg"); +static inline_strike                            = ctRegex!(`⑆[-]┨(?P<text>.+?)┣[-]`, "mg"); +static inline_insert               
             = ctRegex!(`⑆[+]┨(?P<text>.+?)┣[+]`, "mg"); +static inline_mono                              = ctRegex!(`⑆[■]┨(?P<text>.+?)┣[■]`, "mg"); +static inline_cite                              = ctRegex!(`⑆[‖]┨(?P<text>.+?)┣[‖]`, "mg");  #+END_SRC  #+BEGIN_SRC d -// static inline_superscript                    = ctRegex!(`[\^]┨(?P<text>.+?)┣[\^]`, "mg"); -// static inline_fontface_clean                 = ctRegex!(`[*!_/^,+■‖-]┨|┣[*!_/^,+■‖-]`, "mg"); +// static inline_superscript                             = ctRegex!(`⑆[\^]┨(?P<text>.+?)┣[\^]`, "mg"); +// static inline_fontface_clean                          = ctRegex!(`⑆[*!_/^,+■‖-]┨|┣[*!_/^,+■‖-]`, "mg");  #+END_SRC  *** table related @@ -692,6 +654,126 @@ static table_delimiter_col                      = ctRegex!("[ ]*[┊][ ]*", "mg"  static table_delimiter_row                      = ctRegex!("[ ]*\n", "mg");  #+END_SRC +** files filename (& path) (including insert file) :insert:file:path:filename: + +#+HEADER: :tangle "../src/doc_reform/meta/rgx_files.d" +#+HEADER: :noweb yes +#+BEGIN_SRC d +<<doc_header_including_copyright_and_license>> +/++ +  regex: regular expressions used in sisu document parser ++/ +module doc_reform.meta.rgx_files; +static template spineRgxFiles() { +  static struct RgxFiles { +    <<prgmkup_rgx_filename_and_path>> +    <<meta_rgx_language_codes>> +  } +} +#+END_SRC + +#+NAME: prgmkup_rgx_filename_and_path +#+BEGIN_SRC d +static src_pth_sst_or_ssm                       = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.](?P<extension>ss[tm]))$`); +static src_pth_pod_sst_or_ssm                   = ctRegex!(`^(?P<podpath>[/]?(?:[a-zA-Z0-9._-]+/)*)media/text/[a-z]{2}/(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*?[.]ss[tm])$`); +static src_pth_contents                         = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*)/pod[.]manifest$`); +static src_pth_zip                              = 
ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]zip)$`); +static src_pth_types                            = ctRegex!(`^(?P<path>[/]?[a-zA-Z0-9._-]+/)*(?P<gotfile>(?P<filename>[a-zA-Z0-9._-]+[.]ss[tm])|(?P<filelist>[a-zA-Z0-9._-]+/pod[.]manifest)|(?P<filezip>[a-zA-Z0-9._-]+[.]zip))$`); +static src_fn                                   = ctRegex!(`^([/]?(?:[a-zA-Z0-9._-]+/)*)(?P<fn_src>(?P<fn_base>[a-zA-Z0-9._-]+)[.](?P<fn_src_suffix>ss[tm]))$`); +static src_fn_master                            = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ssm)$`); +static src_fn_find_inserts                      = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ss[im])$`); +static insert_src_fn_ssi_or_sst                 = ctRegex!(`^<<\s*(?P<path>[a-zA-Z0-9._-]+/)*(?P<filename>[a-zA-Z0-9._-]+[.]ss[ti])$`); +static src_base_parent_dir_name                 = ctRegex!(`[/](?P<dir>(?:[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure +static src_formalised_file_path_parts           = ctRegex!(`(?P<pth>(?:[/a-zA-Z0-9._-]+?)(?P<dir>[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure +#+END_SRC + +** _module template yaml tags + +#+HEADER: :tangle "../src/doc_reform/meta/rgx_yaml_tags.d" +#+HEADER: :noweb yes +#+BEGIN_SRC d +<<doc_header_including_copyright_and_license>> +/++ +  regex: regular expressions used in sisu document parser ++/ +module doc_reform.meta.rgx_yaml; +static template spineRgxYamlTags() { +  static struct RgxYaml { +    <<meta_rgx_yaml>> +  } +} +#+END_SRC + +#+NAME: meta_rgx_yaml +#+BEGIN_SRC d +static yaml_tag_is_str                          = ctRegex!(`:str$`); +static yaml_tag_is_int                          = ctRegex!(`:int$`); +static yaml_tag_is_map                          = ctRegex!(`:map$`); +static yaml_tag_is_seq                          = ctRegex!(`:seq$`); +#+END_SRC + +** special characters +*** xhtml special characters 
template + +#+HEADER: :tangle "../src/doc_reform/io_out/rgx_xhtml.d" +#+HEADER: :noweb yes +#+BEGIN_SRC d +<<doc_header_including_copyright_and_license>> +/++ +  regex: regular expressions used in sisu document parser ++/ +module doc_reform.io_out.rgx_xhtml; +static template spineRgxXHTML() { +  static struct RgxXHTML { +    <<sp_ch_xhtml_rgx>> +  } +} +#+END_SRC + +*** xhtml special characters + +#+NAME: sp_ch_xhtml_rgx +#+BEGIN_SRC d +static ampersand                                = ctRegex!(`[&]`, "m");      // & +static quotation                                = ctRegex!(`["]`, "m");      // " +static less_than                                = ctRegex!(`[<]`, "m");      // < +static greater_than                             = ctRegex!(`[>]`, "m");      // > +static line_break                               = ctRegex!(` [\\]{2}`, "m"); // <br /> +#+END_SRC + +*** LaTeX special characters template + +#+HEADER: :tangle "../src/doc_reform/io_out/rgx_latex.d" +#+HEADER: :noweb yes +#+BEGIN_SRC d +<<doc_header_including_copyright_and_license>> +/++ +  regex: regular expressions used in sisu document parser ++/ +module doc_reform.io_out.rgx_latex; +static template spineRgxLSC() { +  static struct RgxLSC { +    <<sp_ch_latex_rgx>> +  } +} +#+END_SRC + +*** latex special characters + +#+NAME: sp_ch_latex_rgx +#+BEGIN_SRC d +static latex_special_char                       = ctRegex!(`([%${}_#&\\])`); +static latex_special_char_for_escape            = ctRegex!(`([%${}_#\\])`); +static latex_special_char_for_escape_and_braces = ctRegex!(`([&])`); +static latex_special_char_for_escape_url        = ctRegex!(`([%])`); +static latex_special_char_escaped               = ctRegex!(`\\([%${}_#\\])`); +static latex_special_char_escaped_braced        = ctRegex!(`[{]\\([&])[}]`); +static latex_identify_inline_link               = ctRegex!(`┥.+?┝┤\S+?├`, "mg"); +static latex_identify_inline_fontface           = ctRegex!(`\\([_#$]┨.+?┣)\\([_#$])`, "mg"); +static 
latex_clean_internal_link                = ctRegex!(`^(?:#|¤\S+?#)`, "m"); +static latex_clean_bookindex_linebreak          = ctRegex!(`\s*\\\\\\\\\s*`, "m"); +#+END_SRC +  * document header including copyright & license  #+NAME: doc_header_including_copyright_and_license  | 
