diff options
author | Ralph Amissah <ralph.amissah@gmail.com> | 2022-11-25 22:06:40 -0500 |
---|---|---|
committer | Ralph Amissah <ralph.amissah@gmail.com> | 2022-12-23 18:17:41 -0500 |
commit | f6d28b62f0e02b8a88a1832589e203c7a613f45b (patch) | |
tree | b5d6462e45bae998190194784e02b143a83f79a3 /org/default_regex.org | |
parent | gitignore & things nix (diff) |
regex review, match speed & compile time, ctregex
- improve match time
- add interim fontface identifier marker
- improve compile time
- remove unused regexes
- separate out some specialized output matches
Diffstat (limited to 'org/default_regex.org')
-rw-r--r-- | org/default_regex.org | 198 |
1 files changed, 140 insertions, 58 deletions
diff --git a/org/default_regex.org b/org/default_regex.org index 89d6ea3..976baa0 100644 --- a/org/default_regex.org +++ b/org/default_regex.org @@ -67,7 +67,6 @@ static template spineRgxIn() { <<meta_rgx_bibliography>> <<meta_rgx_book_index_split>> <<meta_rgx_topic_register_split>> - <<meta_rgx_language_codes>> <<prgmkup_rgx_spaces>> <<prgmkup_rgx_filename_and_path>> <<prgmkup_rgx_inline_breaks>> @@ -86,10 +85,6 @@ static template spineRgxIn() { /+ misc +/ static flag_action = ctRegex!(`^(--[a-z][a-z0-9-]+)$`); static within_quotes = ctRegex!(`"(.+?)"`, "m"); -static yaml_tag_is_str = ctRegex!(`:str$`); -static yaml_tag_is_int = ctRegex!(`:int$`); -static yaml_tag_is_map = ctRegex!(`:map$`); -static yaml_tag_is_seq = ctRegex!(`:seq$`); static make_heading_delimiter = ctRegex!(`[;][ ]*`); static arr_delimiter = ctRegex!(`[ ]*[;][ ]*`); static name_delimiter = ctRegex!(`^([^,]+)[ ]*,[ ]+(.+?)$`); @@ -476,8 +471,6 @@ static template spineRgxOut() { <<prgmkup_rgx_inline_links>> <<prgmkup_rgx_inline_font_face>> <<prgmkup_rgx_table>> - <<sp_ch_xhtml_rgx>> - <<sp_ch_latex_rgx>> <<grouped_text_rgx_paragraph_marks>> } } @@ -492,35 +485,22 @@ static make_breakpage = ctRegex!(`new=(?P<breakpage>.+ static make_breakcolumn = ctRegex!(`break=(?P<breakcolumn>.+?)(?:;|$)`,); #+END_SRC -** special characters -*** xhtml special characters +* 2. 
ctRegex defaults shared by meta & output (generic) -#+NAME: sp_ch_xhtml_rgx +** meta + +#+NAME: prgmkup_rgx_meta #+BEGIN_SRC d -static xhtml_ampersand = ctRegex!(`[&]`, "m"); // & -static xhtml_quotation = ctRegex!(`["]`, "m"); // " -static xhtml_less_than = ctRegex!(`[<]`, "m"); // < -static xhtml_greater_than = ctRegex!(`[>]`, "m"); // > -static xhtml_line_break = ctRegex!(` [\\]{2}`, "m"); // <br /> +static space = ctRegex!(`[ ]`, "mg"); +static spaces_keep = ctRegex!(`(?P<keep_spaces>^[ ]+|[ ]{2,})`, "mg"); // code, verse, block #+END_SRC -*** latex special characters +** spine & source_in -#+NAME: sp_ch_latex_rgx +#+NAME: prgmkup_rgx_in #+BEGIN_SRC d -static latex_special_char = ctRegex!(`([%${}_#&\\])`); -static latex_special_char_for_escape = ctRegex!(`([%${}_#\\])`); -static latex_special_char_for_escape_and_braces = ctRegex!(`([&])`); -static latex_special_char_for_escape_url = ctRegex!(`([%])`); -static latex_special_char_escaped = ctRegex!(`\\([%${}_#\\])`); -static latex_special_char_escaped_braced = ctRegex!(`[{]\\([&])[}]`); -static latex_identify_inline_link = ctRegex!(`┥.+?┝┤\S+?├`, "mg"); -static latex_identify_inline_fontface = ctRegex!(`\\([_#$]┨.+?┣)\\([_#$])`, "mg"); -static latex_clean_internal_link = ctRegex!(`^(?:#|¤\S+?#)`, "m"); -static latex_clean_bookindex_linebreak = ctRegex!(`\s*\\\\\\\\\s*`, "m"); #+END_SRC -* 2. 
ctRegex defaults shared by meta & output (generic) ** misc generic #+NAME: prgmkup_rgx_spaces @@ -534,24 +514,6 @@ static nbsp_chars = ctRegex!(`[░]+`, "mg"); static middle_dot = ctRegex!(`·`, "mg"); #+END_SRC -** filename (& path) (including insert file) :insert:file:path:filename: - -#+NAME: prgmkup_rgx_filename_and_path -#+BEGIN_SRC d -static src_pth_sst_or_ssm = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.](?P<extension>ss[tm]))$`); -static src_pth_pod_sst_or_ssm = ctRegex!(`^(?P<podpath>[/]?(?:[a-zA-Z0-9._-]+/)*)media/text/[a-z]{2}/(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*?[.]ss[tm])$`); -static src_pth_contents = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*)/pod[.]manifest$`); -static src_pth_zip = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]zip)$`); -static src_pth_types = ctRegex!(`^(?P<path>[/]?[a-zA-Z0-9._-]+/)*(?P<gotfile>(?P<filename>[a-zA-Z0-9._-]+[.]ss[tm])|(?P<filelist>[a-zA-Z0-9._-]+/pod[.]manifest)|(?P<filezip>[a-zA-Z0-9._-]+[.]zip))$`); -static src_fn = - ctRegex!(`^([/]?(?:[a-zA-Z0-9._-]+/)*)(?P<fn_src>(?P<fn_base>[a-zA-Z0-9._-]+)[.](?P<fn_src_suffix>ss[tm]))$`); -static src_fn_master = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ssm)$`); -static src_fn_find_inserts = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ss[im])$`); -static insert_src_fn_ssi_or_sst = ctRegex!(`^<<\s*(?P<path>[a-zA-Z0-9._-]+/)*(?P<filename>[a-zA-Z0-9._-]+[.]ss[ti])$`); -static src_base_parent_dir_name = ctRegex!(`[/](?P<dir>(?:[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure -static src_formalised_file_path_parts = ctRegex!(`(?P<pth>(?:[/a-zA-Z0-9._-]+?)(?P<dir>[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure -#+END_SRC - ** inline markup *** inline breaks @@ -666,21 +628,21 @@ static quotation_mark_sql_insert_delimiter = ctRegex!("[']", "mg"); #+NAME: 
prgmkup_rgx_inline_font_face #+BEGIN_SRC d /+ inline markup font face mod +/ -static inline_emphasis = ctRegex!(`[*]┨(?P<text>.+?)┣[*]`, "mg"); -static inline_bold = ctRegex!(`[!]┨(?P<text>.+?)┣[!]`, "mg"); -static inline_underscore = ctRegex!(`[_]┨(?P<text>.+?)┣[_]`, "mg"); -static inline_italics = ctRegex!(`[/]┨(?P<text>.+?)┣[/]`, "mg"); -static inline_superscript = ctRegex!(`\^┨(?P<text>.+?)┣\^`, "mg"); -static inline_subscript = ctRegex!(`[,]┨(?P<text>.+?)┣[,]`, "mg"); -static inline_strike = ctRegex!(`[-]┨(?P<text>.+?)┣[-]`, "mg"); -static inline_insert = ctRegex!(`[+]┨(?P<text>.+?)┣[+]`, "mg"); -static inline_mono = ctRegex!(`[■]┨(?P<text>.+?)┣[■]`, "mg"); -static inline_cite = ctRegex!(`[‖]┨(?P<text>.+?)┣[‖]`, "mg"); +static inline_emphasis = ctRegex!(`⑆[*]┨(?P<text>.+?)┣[*]`, "mg"); +static inline_bold = ctRegex!(`⑆[!]┨(?P<text>.+?)┣[!]`, "mg"); +static inline_underscore = ctRegex!(`⑆[_]┨(?P<text>.+?)┣[_]`, "mg"); +static inline_italics = ctRegex!(`⑆[/]┨(?P<text>.+?)┣[/]`, "mg"); +static inline_superscript = ctRegex!(`⑆\^┨(?P<text>.+?)┣\^`, "mg"); +static inline_subscript = ctRegex!(`⑆[,]┨(?P<text>.+?)┣[,]`, "mg"); +static inline_strike = ctRegex!(`⑆[-]┨(?P<text>.+?)┣[-]`, "mg"); +static inline_insert = ctRegex!(`⑆[+]┨(?P<text>.+?)┣[+]`, "mg"); +static inline_mono = ctRegex!(`⑆[■]┨(?P<text>.+?)┣[■]`, "mg"); +static inline_cite = ctRegex!(`⑆[‖]┨(?P<text>.+?)┣[‖]`, "mg"); #+END_SRC #+BEGIN_SRC d -// static inline_superscript = ctRegex!(`[\^]┨(?P<text>.+?)┣[\^]`, "mg"); -// static inline_fontface_clean = ctRegex!(`[*!_/^,+■‖-]┨|┣[*!_/^,+■‖-]`, "mg"); +// static inline_superscript = ctRegex!(`⑆[\^]┨(?P<text>.+?)┣[\^]`, "mg"); +// static inline_fontface_clean = ctRegex!(`⑆[*!_/^,+■‖-]┨|┣[*!_/^,+■‖-]`, "mg"); #+END_SRC *** table related @@ -692,6 +654,126 @@ static table_delimiter_col = ctRegex!("[ ]*[┊][ ]*", "mg" static table_delimiter_row = ctRegex!("[ ]*\n", "mg"); #+END_SRC +** files filename (& path) (including insert file) :insert:file:path:filename: + 
+#+HEADER: :tangle "../src/doc_reform/meta/rgx_files.d" +#+HEADER: :noweb yes +#+BEGIN_SRC d +<<doc_header_including_copyright_and_license>> +/++ + regex: regular expressions used in sisu document parser ++/ +module doc_reform.meta.rgx_files; +static template spineRgxFiles() { + static struct RgxFiles { + <<prgmkup_rgx_filename_and_path>> + <<meta_rgx_language_codes>> + } +} +#+END_SRC + +#+NAME: prgmkup_rgx_filename_and_path +#+BEGIN_SRC d +static src_pth_sst_or_ssm = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.](?P<extension>ss[tm]))$`); +static src_pth_pod_sst_or_ssm = ctRegex!(`^(?P<podpath>[/]?(?:[a-zA-Z0-9._-]+/)*)media/text/[a-z]{2}/(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*?[.]ss[tm])$`); +static src_pth_contents = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*)/pod[.]manifest$`); +static src_pth_zip = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]zip)$`); +static src_pth_types = ctRegex!(`^(?P<path>[/]?[a-zA-Z0-9._-]+/)*(?P<gotfile>(?P<filename>[a-zA-Z0-9._-]+[.]ss[tm])|(?P<filelist>[a-zA-Z0-9._-]+/pod[.]manifest)|(?P<filezip>[a-zA-Z0-9._-]+[.]zip))$`); +static src_fn = ctRegex!(`^([/]?(?:[a-zA-Z0-9._-]+/)*)(?P<fn_src>(?P<fn_base>[a-zA-Z0-9._-]+)[.](?P<fn_src_suffix>ss[tm]))$`); +static src_fn_master = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ssm)$`); +static src_fn_find_inserts = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ss[im])$`); +static insert_src_fn_ssi_or_sst = ctRegex!(`^<<\s*(?P<path>[a-zA-Z0-9._-]+/)*(?P<filename>[a-zA-Z0-9._-]+[.]ss[ti])$`); +static src_base_parent_dir_name = ctRegex!(`[/](?P<dir>(?:[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure +static src_formalised_file_path_parts = ctRegex!(`(?P<pth>(?:[/a-zA-Z0-9._-]+?)(?P<dir>[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure +#+END_SRC + +** _module template yaml tags + 
+#+HEADER: :tangle "../src/doc_reform/meta/rgx_yaml_tags.d" +#+HEADER: :noweb yes +#+BEGIN_SRC d +<<doc_header_including_copyright_and_license>> +/++ + regex: regular expressions used in sisu document parser ++/ +module doc_reform.meta.rgx_yaml; +static template spineRgxYamlTags() { + static struct RgxYaml { + <<meta_rgx_yaml>> + } +} +#+END_SRC + +#+NAME: meta_rgx_yaml +#+BEGIN_SRC d +static yaml_tag_is_str = ctRegex!(`:str$`); +static yaml_tag_is_int = ctRegex!(`:int$`); +static yaml_tag_is_map = ctRegex!(`:map$`); +static yaml_tag_is_seq = ctRegex!(`:seq$`); +#+END_SRC + +** special characters +*** xhtml special characters template + +#+HEADER: :tangle "../src/doc_reform/io_out/rgx_xhtml.d" +#+HEADER: :noweb yes +#+BEGIN_SRC d +<<doc_header_including_copyright_and_license>> +/++ + regex: regular expressions used in sisu document parser ++/ +module doc_reform.io_out.rgx_xhtml; +static template spineRgxXHTML() { + static struct RgxXHTML { + <<sp_ch_xhtml_rgx>> + } +} +#+END_SRC + +*** xhtml special characters + +#+NAME: sp_ch_xhtml_rgx +#+BEGIN_SRC d +static ampersand = ctRegex!(`[&]`, "m"); // & +static quotation = ctRegex!(`["]`, "m"); // " +static less_than = ctRegex!(`[<]`, "m"); // < +static greater_than = ctRegex!(`[>]`, "m"); // > +static line_break = ctRegex!(` [\\]{2}`, "m"); // <br /> +#+END_SRC + +*** LaTeX special characters template + +#+HEADER: :tangle "../src/doc_reform/io_out/rgx_latex.d" +#+HEADER: :noweb yes +#+BEGIN_SRC d +<<doc_header_including_copyright_and_license>> +/++ + regex: regular expressions used in sisu document parser ++/ +module doc_reform.io_out.rgx_latex; +static template spineRgxLSC() { + static struct RgxLSC { + <<sp_ch_latex_rgx>> + } +} +#+END_SRC + +*** latex special characters + +#+NAME: sp_ch_latex_rgx +#+BEGIN_SRC d +static latex_special_char = ctRegex!(`([%${}_#&\\])`); +static latex_special_char_for_escape = ctRegex!(`([%${}_#\\])`); +static latex_special_char_for_escape_and_braces = ctRegex!(`([&])`); +static 
latex_special_char_for_escape_url = ctRegex!(`([%])`); +static latex_special_char_escaped = ctRegex!(`\\([%${}_#\\])`); +static latex_special_char_escaped_braced = ctRegex!(`[{]\\([&])[}]`); +static latex_identify_inline_link = ctRegex!(`┥.+?┝┤\S+?├`, "mg"); +static latex_identify_inline_fontface = ctRegex!(`\\([_#$]┨.+?┣)\\([_#$])`, "mg"); +static latex_clean_internal_link = ctRegex!(`^(?:#|¤\S+?#)`, "m"); +static latex_clean_bookindex_linebreak = ctRegex!(`\s*\\\\\\\\\s*`, "m"); +#+END_SRC + * document header including copyright & license #+NAME: doc_header_including_copyright_and_license |