\ Regex - ANS Forth Regular Expression package including String Builder
\ Version 0.8
\ Copyright (C) Gerry Jackson 2010
\ This software is free; you can redistribute it and/or modify it in
\ any way provided you acknowledge the original source and copyright
\ and keep this notice with the source code.
\ This program is distributed in the hope that it will be useful,
\ but WITHOUT ANY WARRANTY; without even the implied warranty of
\ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
\ ------------------------------------------------------------------------------
base @ constant rgx-user-base decimal
\ ------------------------------------------------------------------------------
\ Provides ANS compatibility for some common but non-standard words
s" [undefined]" pad c! pad char+ pad c@ move
pad find nip 0=
[if]
: [undefined] ( "name" -- flag )
bl word find nip 0=
; immediate
[then]
[undefined] [defined]
[if] : [defined] postpone [undefined] 0= ; immediate [then]
[undefined] -rot [if] : -rot rot rot ; [then]
[undefined] <= [if] : <= > 0= ; [then]
[undefined] >= [if] : >= < 0= ; [then]
[undefined] endif [if] : endif postpone then ; immediate [then]
[undefined] on [if] : on ( ad -- ) -1 swap ! ; [then]
[undefined] off [if] : off ( ad -- ) 0 swap ! ; [then]
[undefined] defer [if] : defer create ( "name" -- )
['] abort , does> @ execute ; [then]
[undefined] is [if]
: is ( xt "name" -- )
' ( xt xt2)
state @ if
postpone literal postpone >body postpone !
else
>body !
then ; immediate
[then]
[undefined] parse-name
[if] \ From Forth 200X web site
: isspace? ( c -- f ) bl 1+ u< ;
: isnotspace? ( c -- f ) isspace? 0= ;
: xt-skip ( addr1 n1 xt -- addr2 n2 ) \ gforth
\ skip all characters satisfying xt ( c -- f )
>r
begin
dup
while
over c@ r@ execute
while
1 /string
repeat then
r> drop
;
: parse-name ( "name" -- c-addr u )
source >in @ /string
['] isspace? xt-skip over >r
['] isnotspace? xt-skip ( end-word restlen r: start-word )
2dup 1 min + source drop - >in !
drop r> tuck -
;
[then]
s" xmini_oof.fth" included
s" sets.fth" included
[undefined] 1Cell [if] 1 cells constant 1Cell [then]
[undefined] ]] [if]
\ Postponing a series of words e.g. : abc ... ]] (postponed words) [[ ... ;
\ Only handles words on 1 line, could improve to be multi-line
\ Also there are problems with 'ing a word with undefined interpretation
\ semantics (ambiguous condition)
: [[ ; \ Stop postponing
: ]]
begin
>in @ ' ['] [[ <>
while
>in ! postpone postpone
repeat
drop
; immediate
[then]
\ PFE bug avoidance - fails with a negative offset to /string
s" hello!" 4 /string -4 /string nip 0=
[if] \ Thanks to David Williams
\ Assume that len >= 0 and (len -i) >= 0:
: /string ( addr len i -- addr+c[i] len-i )
>r r@ - swap r> chars + swap
;
[then]
\ ------------------------------------------------------------------------------
\ State transition tables for a lexical scanner
\ Generated by LexGen
\ See http://www.qlikz.org/forth/lexgen/lexgen.html
base @ decimal : ~ 0 0 parse-name >number dup >r dup 1 min
/string >number 2drop drop r> if negate then , ; 249 value #states 36 base !
0 value BaseDefaultData here to BaseDefaultData ~ -1J ~ -1 ~ 0 ~ 7Y ~ -1 ~ 0
~ 61 ~ -1 ~ 0 ~ 5J ~ -1 ~ 0 ~ 18 ~ -1 ~ 0 ~ 0 ~ -1 ~ 2 ~ 3B ~ -1 ~ 1P ~ 0 ~ -1
~ 1Q ~ 0 ~ -1 ~ 3 ~ 45 ~ -1 ~ 0 ~ 0 ~ -1 ~ 2 ~ 0 ~ -1 ~ 1G ~ 0 ~ -1 ~ 3B ~ -F
~ -1 ~ 1P ~ 0 ~ -1 ~ 1Q ~ 1Q ~ 1E ~ 0 ~ -28 ~ -1 ~ 3C ~ 0 ~ -1 ~ 3 ~ 48 ~ -1
~ 2T ~ 0 ~ -1 ~ 2 ~ 0 ~ -1 ~ 1G ~ 15 ~ -1 ~ 1P ~ 0 ~ -1 ~ 1Q ~ 1S ~ -1 ~ 4
~ -D ~ -1 ~ 5 ~ 0 ~ -1 ~ M ~ -E ~ -1 ~ 6 ~ U ~ -1 ~ N ~ -W ~ 2M ~ 0 ~ 0 ~ -1
~ 1E ~ 6F ~ -1 ~ 0 ~ 0 ~ -1 ~ 3 ~ 49 ~ -1 ~ 2T ~ 0 ~ -1 ~ 2I ~ 0 ~ -1 ~ 2F
~ 2X ~ -1 ~ 0 ~ 0 ~ -1 ~ 2G ~ 0 ~ -1 ~ 2T ~ 0 ~ -1 ~ 0 ~ 2 ~ -1 ~ 1U ~ 0 ~ -1
~ 1X ~ 7G ~ -1 ~ 1U ~ 0 ~ -1 ~ 1C ~ 0 ~ -1 ~ 1D ~ 0 ~ -1 ~ 14 ~ 0 ~ -1 ~ 35
~ 0 ~ -1 ~ 11 ~ 0 ~ -1 ~ 12 ~ 0 ~ -1 ~ 15 ~ 0 ~ 1E ~ 2U ~ 7T ~ -1 ~ 30 ~ 0 ~ -1
~ 2Z ~ 0 ~ -1 ~ 32 ~ 0 ~ -1 ~ 2W ~ 0 ~ -1 ~ 33 ~ 0 ~ -1 ~ 34 ~ 0 ~ -1 ~ 2Y
~ 0 ~ -1 ~ 1A ~ 0 ~ -1 ~ 2J ~ 0 ~ -1 ~ 2K ~ 0 ~ -1 ~ 2L ~ 0 ~ -1 ~ 2M ~ 18 ~ -1
~ 0 ~ 0 ~ -1 ~ 2V ~ 0 ~ -1 ~ 2N ~ 0 ~ -1 ~ 2O ~ 0 ~ -1 ~ V ~ 0 ~ -1 ~ 2P ~ 0
~ -1 ~ 2X ~ 0 ~ -1 ~ 2Q ~ 0 ~ -1 ~ X ~ 0 ~ -1 ~ 16 ~ 0 ~ -1 ~ Y ~ 8B ~ -1 ~ 0
~ 0 ~ -1 ~ 1X ~ 5P ~ -1 ~ 1U ~ 0 ~ -1 ~ G ~ 0 ~ -1 ~ A ~ 0 ~ -1 ~ H ~ 0 ~ -1
~ B ~ 0 ~ -1 ~ I ~ 0 ~ -1 ~ C ~ 0 ~ -1 ~ O ~ 0 ~ -1 ~ Q ~ 0 ~ -1 ~ P ~ 0 ~ -1
~ 1C ~ 0 ~ -1 ~ 1D ~ 0 ~ -1 ~ 14 ~ 0 ~ -1 ~ 11 ~ 0 ~ -1 ~ 12 ~ 0 ~ -1 ~ 17
~ 0 ~ -1 ~ 18 ~ 0 ~ -1 ~ 15 ~ 0 ~ 2M ~ 2U ~ 8L ~ -1 ~ 30 ~ 0 ~ -1 ~ 19 ~ 0 ~ -1
~ 1F ~ 0 ~ -1 ~ 1K ~ 0 ~ -1 ~ S ~ 0 ~ -1 ~ 2Z ~ 0 ~ -1 ~ 2W ~ 0 ~ -1 ~ W ~ 0
~ -1 ~ 2Y ~ 0 ~ -1 ~ U ~ 0 ~ -1 ~ 1I ~ 0 ~ -1 ~ Z ~ 0 ~ -1 ~ 1A ~ 0 ~ -1 ~ 10
~ 0 ~ -1 ~ 13 ~ 0 ~ -1 ~ 2J ~ 0 ~ -1 ~ 1J ~ 0 ~ -1 ~ 2K ~ 0 ~ -1 ~ R ~ 0 ~ -1
~ 2L ~ 0 ~ -1 ~ 2M ~ 1F ~ -1 ~ 0 ~ 0 ~ -1 ~ 2V ~ 0 ~ -1 ~ 2N ~ 0 ~ -1 ~ 2O
~ 0 ~ -1 ~ V ~ 0 ~ -1 ~ 2P ~ 0 ~ -1 ~ 2X ~ 0 ~ -1 ~ 2Q ~ 0 ~ -1 ~ T ~ 1P ~ -1
~ 2R ~ 0 ~ -1 ~ 1H ~ 0 ~ -1 ~ X ~ 0 ~ -1 ~ 16 ~ 0 ~ -1 ~ Y ~ -T ~ -1 ~ 0 ~ 0
~ -1 ~ 1X ~ 0 ~ -1 ~ 14 ~ 0 ~ -1 ~ 1B ~ 0 ~ -1 ~ 2U ~ 0 ~ -1 ~ S ~ 0 ~ -1 ~ W
~ 0 ~ -1 ~ U ~ 0 ~ -1 ~ 1A ~ 0 ~ -1 ~ 10 ~ 0 ~ -1 ~ 13 ~ 0 ~ -1 ~ 2J ~ 0 ~ -1
~ 1J ~ 0 ~ -1 ~ 2K ~ 0 ~ -1 ~ R ~ 0 ~ -1 ~ 2L ~ 0 ~ -1 ~ 2M ~ 0 ~ -1 ~ 2N ~ 0
~ -1 ~ 2O ~ 0 ~ -1 ~ V ~ 0 ~ -1 ~ 2P ~ 0 ~ -1 ~ 2Q ~ 0 ~ -1 ~ T ~ A ~ -1 ~ 2R
~ -1B ~ -1 ~ 0 ~ 36 ~ -1 ~ 0 ~ 4A ~ -1 ~ 0 ~ 2W ~ -1 ~ 0 ~ 0 ~ -1 ~ 1V ~ 0 ~ -1
~ 3A ~ -1L ~ -1 ~ 0 ~ 1I ~ -1 ~ 0 ~ 2V ~ -1 ~ 0 ~ 2T ~ -1 ~ 0 ~ -N ~ -1 ~ 0
~ 0 ~ -1 ~ 1V ~ 0 ~ -1 ~ 31 ~ 65 ~ -1 ~ 0 ~ 0 ~ -1 ~ 1M ~ -2V ~ -1 ~ 0 ~ 0 ~ -1
~ 1R ~ 13 ~ -1 ~ 1S ~ 0 ~ -1 ~ 1L ~ 0 ~ -1 ~ 1T ~ -1A ~ -1 ~ 0 ~ 12 ~ -1 ~ 0
~ 2R ~ -1 ~ 0 ~ 2U ~ -1 ~ 0 ~ N ~ -1 ~ 0 ~ 32 ~ -1 ~ 0 ~ 0 ~ -1 ~ 1V ~ 0 ~ -1
~ 31 ~ 0 ~ -1 ~ 2S ~ -3G ~ 63 ~ 0 ~ 1U ~ -1 ~ 9 ~ 0 ~ -1 ~ 2S ~ D ~ -1 ~ 0
~ 4 ~ -1 ~ 0 ~ 3H ~ -1 ~ 0 ~ 0 ~ -1 ~ 36 ~ 0 ~ -1 ~ 38 ~ 0 ~ -1 ~ 2A ~ 0 ~ -1
~ 2B ~ -Y ~ -1 ~ 0 ~ 2H ~ -1 ~ 0 ~ 2J ~ -1 ~ 0 ~ 0 ~ -1 ~ 36 ~ 0 ~ -1 ~ 2E
~ -2F ~ -1 ~ 0 ~ 0 ~ -1 ~ 38 ~ 0 ~ -1 ~ 2A ~ 0 ~ -1 ~ 2B ~ -3C ~ -1 ~ 0 ~ G
~ -1 ~ 0 ~ 1Y ~ -1 ~ 0 ~ 39 ~ -1 ~ 0 ~ 2M ~ -1 ~ 0 ~ 0 ~ -1 ~ 1O ~ 0 ~ -1 ~ 1N
~ 0 ~ -1 ~ 1W ~ 0 ~ -1 ~ 2E ~ 1K ~ -1 ~ 0 ~ 0 ~ -1 ~ 1Y ~ 0 ~ -1 ~ 1Z ~ 0 ~ -1
~ 22 ~ 0 ~ -1 ~ 23 ~ 0 ~ -1 ~ 26 ~ 0 ~ -1 ~ 27 ~ 0 ~ -1 ~ 2A ~ 0 ~ -1 ~ 2B
~ 1S ~ -1 ~ 0 ~ 3K ~ -1 ~ 8 ~ 0 ~ -1 ~ L ~ 0 ~ -1 ~ F ~ 0 ~ -1 ~ 37 ~ 0 ~ -1
~ 39 ~ 0 ~ -1 ~ 2C ~ 0 ~ -1 ~ 2D ~ 0 ~ -1 ~ 37 ~ 0 ~ -1 ~ 39 ~ 0 ~ -1 ~ 2C
~ 0 ~ -1 ~ 2D ~ 3W ~ -1 ~ 0 ~ 0 ~ -1 ~ 9 ~ 0 ~ -1 ~ 20 ~ 0 ~ -1 ~ 21 ~ 0 ~ -1
~ 24 ~ 0 ~ -1 ~ 25 ~ 0 ~ -1 ~ 28 ~ 0 ~ -1 ~ 29 ~ 0 ~ -1 ~ 2C ~ 0 ~ -1 ~ 2D
~ -9 ~ -1 ~ 0 ~ 1R ~ -1 ~ 7 ~ 0 ~ -1 ~ K ~ 0 ~ -1 ~ E ~ 0 ~ -1 ~ 2E ~ 0 ~ -1
~ 2E ~ 0 ~ -1 ~ J ~ 0 ~ -1 ~ D decimal 412 value maxCheck 36 base !
0 value CheckNextData here to CheckNextData ~ S ~ 2D ~ 52 ~ 64 ~ 4O ~ 5M ~ S
~ 2E ~ S ~ 2F ~ 5L ~ 6G ~ 4O ~ 5N ~ 5C ~ 6B ~ S ~ 2G ~ S ~ 2H ~ S ~ 2I ~ S ~ 2J
~ 4O ~ 5O ~ 5H ~ 6F ~ S ~ 2K ~ 3L ~ 52 ~ S ~ 2L ~ 4O ~ 5P ~ 4J ~ 5J ~ 3L ~ 3L
~ 3L ~ 3L ~ 3L ~ 3L ~ 3L ~ 3L ~ 3L ~ 3L ~ 3L ~ 3L ~ 3L ~ 3L ~ 3L ~ 3L ~ 3L ~ 3L
~ 3L ~ 3L ~ Q ~ 28 ~ O ~ 26 ~ S ~ 2N ~ 6P ~ 6U ~ S ~ 2O ~ S ~ 2P ~ 4J ~ 5K ~ S
~ 2Q ~ S ~ 2R ~ 0 ~ 1 ~ 0 ~ 2 ~ 0 ~ 3 ~ 0 ~ 4 ~ 4F ~ 5C ~ G ~ 21 ~ S ~ 2S ~ 56
~ 68 ~ 4F ~ 5D ~ 13 ~ 49 ~ D ~ 15 ~ Q ~ 29 ~ O ~ 27 ~ S ~ 2T ~ 49 ~ 55 ~ S ~ 2U
~ 55 ~ 67 ~ S ~ 2V ~ 49 ~ 56 ~ 5M ~ 6H ~ S ~ 2W ~ S ~ 2X ~ S ~ 2Y ~ S ~ 2Z ~ S
~ 30 ~ 4F ~ 5E ~ 4X ~ 5Z ~ S ~ 31 ~ S ~ 32 ~ S ~ 33 ~ S ~ 34 ~ S ~ 35 ~ S ~ 36
~ S ~ 37 ~ 4Q ~ 5Q ~ 49 ~ 57 ~ 5M ~ 6I ~ R ~ 2A ~ S ~ 38 ~ 4T ~ 5S ~ S ~ 39
~ 4U ~ 5T ~ 4 ~ X ~ 4X ~ 60 ~ S ~ 3A ~ S ~ 3B ~ S ~ 3C ~ S ~ 3D ~ S ~ 3E ~ S
~ 3F ~ S ~ 3G ~ 4 ~ Y ~ S ~ 3H ~ S ~ 3I ~ S ~ 3J ~ S ~ 3K ~ F ~ 16 ~ 4G ~ 5F
~ 3L ~ 53 ~ F ~ 17 ~ F ~ 18 ~ F ~ 19 ~ 4Q ~ 5R ~ 13 ~ 4A ~ F ~ 1A ~ F ~ 1B ~ L
~ 23 ~ 13 ~ 4B ~ 6Q ~ 6V ~ N ~ 24 ~ F ~ 1C ~ 53 ~ 65 ~ F ~ 1D ~ 5N ~ 6J ~ 63
~ 63 ~ 63 ~ 63 ~ 63 ~ 63 ~ 63 ~ 63 ~ 63 ~ 63 ~ 63 ~ 63 ~ 63 ~ 63 ~ 63 ~ 63 ~ 63
~ 63 ~ 63 ~ 63 ~ 13 ~ 4C ~ R ~ 2B ~ R ~ 2C ~ 13 ~ 4D ~ 6Q ~ 6W ~ N ~ 25 ~ 5N
~ 6K ~ 53 ~ 66 ~ 5D ~ 6C ~ F ~ 1F ~ 5E ~ 6D ~ 48 ~ 54 ~ F ~ 1G ~ 5P ~ 6N ~ 4
~ Z ~ 4 ~ 10 ~ F ~ 1H ~ F ~ 1I ~ 4V ~ 5V ~ Z ~ 3N ~ 4I ~ 5I ~ 4W ~ 5X ~ 4H ~ 5G
~ 4C ~ 5A ~ F ~ 1J ~ F ~ 1K ~ 4U ~ 5U ~ 5E ~ 6E ~ Z ~ 3O ~ 4Y ~ 61 ~ 5P ~ 6O
~ Z ~ 3P ~ F ~ 1L ~ 4A ~ 58 ~ 5U ~ 6P ~ 4V ~ 5W ~ 5O ~ 6L ~ F ~ 1M ~ 4W ~ 5Y
~ F ~ 1N ~ 4C ~ 5B ~ F ~ 1O ~ F ~ 1P ~ F ~ 1Q ~ 57 ~ 69 ~ 1Q ~ 4L ~ 4Y ~ 62
~ 4 ~ 11 ~ F ~ 1R ~ 64 ~ 6R ~ F ~ 1S ~ Z ~ 3Q ~ 37 ~ 50 ~ 5O ~ 6M ~ F ~ 1T ~ F
~ 1U ~ F ~ 1V ~ F ~ 1W ~ F ~ 1X ~ 6F ~ 6T ~ 6 ~ 13 ~ 57 ~ 6A ~ 3G ~ 51 ~ F ~ 1Y
~ F ~ 1Z ~ F ~ 20 ~ Z ~ 3R ~ 63 ~ 6Q ~ 9 ~ 14 ~ 64 ~ 6S ~ Z ~ 3S ~ I ~ 22 ~ W
~ 3M ~ 4B ~ 59 ~ -1 ~ -1 ~ Z ~ 3T ~ Z ~ 3U ~ Z ~ 3V ~ -1 ~ -1 ~ -1 ~ -1 ~ Z
~ 3W ~ Z ~ 3X ~ Z ~ 3Y ~ Z ~ 3Z ~ Z ~ 40 ~ Z ~ 41 ~ 3 ~ J ~ 3 ~ J ~ 3 ~ J ~ 3
~ J ~ 3 ~ J ~ 4H ~ 5H ~ -1 ~ -1 ~ Z ~ 42 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ Z ~ 43
~ Z ~ 44 ~ Z ~ 45 ~ -1 ~ -1 ~ Z ~ 46 ~ Z ~ 47 ~ Z ~ 48 ~ 2 ~ A ~ 2 ~ A ~ 2 ~ A
~ 2 ~ A ~ 2 ~ A ~ 3 ~ J ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ 3 ~ K ~ -1 ~ -1 ~ -1
~ -1 ~ 23 ~ 4N ~ 3 ~ L ~ 3 ~ M ~ 3 ~ N ~ 3 ~ O ~ -1 ~ -1 ~ -1 ~ -1 ~ 3 ~ P ~ -1
~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ 2 ~ A ~ 23 ~ 4O ~ -1 ~ -1 ~ -1 ~ -1 ~ 2 ~ B ~ 2 ~ C
~ -1 ~ -1 ~ -1 ~ -1 ~ 2 ~ D ~ 2 ~ E ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ 3 ~ Q ~ 23
~ 4P ~ -1 ~ -1 ~ 23 ~ 4Q ~ 23 ~ 4R ~ 23 ~ 4S ~ 23 ~ 4T ~ 4M ~ 4M ~ 4M ~ 4M ~ 4M
~ 4M ~ 4M ~ 4M ~ 4M ~ 4M ~ 4M ~ 4M ~ 4M ~ 4M ~ 4M ~ 4M ~ 4M ~ 4M ~ 4M ~ 4M ~ U
~ 3L ~ U ~ 3L ~ U ~ 3L ~ U ~ 3L ~ U ~ 3L ~ U ~ 3L ~ U ~ 3L ~ U ~ 3L ~ U ~ 3L
~ U ~ 3L ~ -1 ~ -1 ~ 3 ~ R ~ 3 ~ S ~ -1 ~ -1 ~ 3 ~ T ~ -1 ~ -1 ~ 1 ~ 5 ~ 1 ~ 5
~ 1 ~ 5 ~ 1 ~ 5 ~ 1 ~ 5 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ 15
~ 4E ~ 23 ~ 4U ~ -1 ~ -1 ~ -1 ~ -1 ~ 2 ~ F ~ 23 ~ 4V ~ -1 ~ -1 ~ -1 ~ -1 ~ 15
~ 4F ~ 23 ~ 4W ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ 1 ~ 5 ~ -1 ~ -1 ~ 23 ~ 4X ~ -1
~ -1 ~ 3 ~ U ~ 3 ~ V ~ 3 ~ W ~ 23 ~ 4Y ~ 1 ~ 6 ~ 1 ~ 7 ~ 23 ~ 4Z ~ 1E ~ 1E ~ 1E
~ 1E ~ 1E ~ 1E ~ 1E ~ 1E ~ 1E ~ 1E ~ 1E ~ 1E ~ 1E ~ 1E ~ 1E ~ 1E ~ 1E ~ 1E ~ 1E
~ 1E ~ -1 ~ -1 ~ 2 ~ G ~ 2 ~ H ~ 2 ~ I ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ 4M ~ 5L
~ 21 ~ 4M ~ 21 ~ 4M ~ 21 ~ 4M ~ 21 ~ 4M ~ 21 ~ 4M ~ 21 ~ 4M ~ 21 ~ 4M ~ 21 ~ 4M
~ 21 ~ 4M ~ 21 ~ 4M ~ 2M ~ 2M ~ 2M ~ 2M ~ 2M ~ 2M ~ 2M ~ 2M ~ 2M ~ 2M ~ 2M ~ 2M
~ 2M ~ 2M ~ 2M ~ 2M ~ 2M ~ 2M ~ 2M ~ 2M ~ 15 ~ 4G ~ -1 ~ -1 ~ 15 ~ 4H ~ -1 ~ -1
~ 15 ~ 4I ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1
~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1
~ -1 ~ -1 ~ 15 ~ 4J ~ -1 ~ -1 ~ -1 ~ -1 ~ 15 ~ 4K ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1
~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1
~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ -1 ~ 1 ~ 8
~ 1 ~ 9 base !
\ ------------------------------------------------------------------------------
\ Token values required by Regex and String Builder
\ Automatically generated by the tokens.fth file when running Grace
\ to generate one of the Regex parsers - DO NOT MANUALLY EDIT
1 constant end-of-regex
2 constant white-space
82 constant (?x)tok
84 constant (?-x)tok
86 constant (?end)tok
115 constant (?-c)tok
\ ------------------------------------------------------------------------------
\ Regex - scanner module
\ Uses a state transition table generated by LexGen
\ Simplified version of a file scanner
[defined] [rgx-dev] [if] .( Loading regexscan.fth ...) cr [then]
\ ---[ get-char from a string ]------------------------------------------------
2variable regex-str \ The regular expression being compiled
variable curr-char \ Last character read
: set-regex ( caddr u -- ) regex-str 2! ;
: get-char ( -- ch | -1 ) \ -1 is end of string
regex-str 2@ over swap ?dup ( -- caddr caddr [ u u | 0 ] )
if
1 /string regex-str 2! c@
dup curr-char ! exit ( -- ch )
then
= ( -- -1 )
;
: get-pos ( -- caddr ) regex-str cell+ @ ;
: reset-pos ( caddr -- )
regex-str 2@ >r over - r> +
over 1 chars - c@ curr-char !
regex-str 2!
;
: get-string ( -- caddr u ) postpone regex-str postpone 2@ ; immediate
: scan-to-char ( ch -- caddr u caddr2 u2 )
>r regex-str 2@ 2dup
begin
dup
while
over c@ r@ <>
while
1 /string
repeat then
r> drop
;
: parse-to-char ( ch -- caddr u )
scan-to-char tuck regex-str 2! -
;
: parse-past-char ( ch -- caddr u )
parse-to-char
regex-str 2@ dup if 1 /string regex-str 2! then
;
\ ---[ Access lex arrays ]------------------------------------------------------
1 cells constant 1cell
2 cells constant 2cells
3 cells constant 3cells
: BaseDefault ( index -- ad )
3cells * BaseDefaultData +
;
: CheckNext ( index -- ad )
2cells * CheckNextData +
;
\ These definitions are for readability in the scanner
: lexBase ; immediate \ Compiles nothing
: lexDefault postpone cell+ ; immediate
: lexToken postpone cell+ postpone cell+ ; immediate
: lexCheck ; immediate
: lexNext postpone cell+ ; immediate
\ ---[ Lex arrays, index to abs addresses ]-------------------------------------
\ Conversion of lex array data to absolute addresses. Using absolute addresses
\ is faster than using the array data to index the arrays. An alternative is
\ to replace the , in the LexTables data with a word that does the conversion
\ as the file is read
: ?invalid ( ad1 -- ad2 | 0 ) \ 0 if contents of ad1 negative
@ dup 0<
if
drop 0
else
BaseDefault
then
;
: >addresses
BaseDefaultData #states 3cells * over + swap
?do
i lexBase @ CheckNext i lexBase !
i lexDefault ?invalid i lexDefault !
3cells
+loop
CheckNextData maxCheck 2cells * over + swap
?do
i lexCheck ?invalid i lexCheck !
i lexNext ?invalid i lexNext !
2cells
+loop
maxcheck CheckNext to maxCheck
;
>addresses \ Do the conversion to absolute addresses
\ ---[ Lexical scanning ]-------------------------------------------------------
\ nextState returns 0 if there is no valid next state
\ state and state2 are addesses, ch' is a character converted to cells
\ by the caller
: next-state ( state ch' -- state2 )
tuck over 2>r ( -- ch' state ) ( R: -- ch' state )
tuck lexBase @ + ( -- state ad1 )
dup CheckNextData maxCheck
within ( -- state ad1 f ) \ f = 0 is out of range
if
dup lexCheck @ r@ = ( -- state ad1 f )
if
nip lexNext @ ( -- state2 )
2r> 2drop exit
then
then
r> 2drop lexDefault @ ( -- state3 ) ( R: -- ch )
r> over ( -- state3 ch f ) \ Should this be 0>= ??
if recurse exit then ( -- state4 )
drop ( -- 0 )
;
\ nextToken returns 0 for invalid token else valid token. In regex this
\ is treated as a single character to be matched.
\ Note when a state has a valid token, this is remembered and we carry on
\ until we can get no further. This allows a longer lexeme to be recognised
\ and backtracking (within the line) if there is none such.
\ This scanner is a modified version of the library scanner so that it can
\ handle tokens in both a regular expression and in a character class
\ using the same state transition tables. This is done by passing a leading
\ character into next-token. This character is used instead of reading the
\ first character from the regular expression.
: next-token ( ch -- caddr u token )
get-pos swap regex-str @ 0= ( -- caddr1 ch f )
if end-of-regex exit then ( -- x x token )
2cells * >r dup char+ ( -- caddr1 caddr2 )
0 BaseDefaultData r> ( -- caddr1 caddr2 tok state ch' )
begin
next-state dup ( -- caddr1 caddr2 tok state2 state2)
while
dup lexToken @ 0> ( -- caddr1 caddr2 tok state2 f )
if
>r 2drop get-pos ( -- caddr1 caddr3 )
r@ lexToken @ r> ( -- caddr1 caddr3 tok2 state2 )
then
get-char 2cells *
repeat
drop >r 2dup reset-pos - r> ( -- caddr1 u token )
;
\ -----------------------------------------------------------------------------
\ To read in tokens as used by LexGen
\ Defines a token that returns a unique value
variable tokenval 1 tokenval !
: token ( -- ) ( use: token name ) ( name: -- n )
tokenval @ constant
1 tokenval +!
;
\ ------------------------------------------------------------------------------
[defined] [rgx-dev] [if] .( regexscan.fth loaded ) .s [then]
\ ------------------------------------------------------------------------------
\ Regex - match module
[defined] [rgx-dev] [if] .( Loading regexmatch.fth ...) cr [then]
\ ---[ ASCII control codes used by the parser ]---------------------------------
base @ decimal
0 constant ^nul 7 constant ^bel 8 constant ^bs 9 constant ^ht
10 constant ^lf 11 constant ^vt 12 constant ^ff 13 constant ^cr
27 constant ^esc char } constant rbrace
base !
\ ---[ Case folding ]-----------------------------------------------------------
: >upper ( ch -- CH )
dup [ char a char z 1+ ] 2literal within
if [ char A char a - ] literal + then
;
: str>upper ( caddr u -- )
over + swap
?do i c@ >upper i c! loop
;
: >lower ( CH -- ch )
dup [ char A char Z 1+ ] 2literal within
if [ char a char A - ] literal + then
;
: str>lower ( caddr u -- )
over + swap
?do i c@ >lower i c! loop
;
0 [if]
\ Skip leading white space in a string
: -leading ( caddr u -- caddr2 u2 )
begin dup while over c@ bl 1+ u< while 1 /string repeat then
;
[then]
\ ---[ Sub-expression class ]---------------------------------------------------
\ Objects are used to hold matches to sub-expressions i.e. those within
\ capturing parentheses in a regular expression. These are held in an array
\ which holds addresses of points in the subject string at which they match the
\ open and close parentheses in the regex. Element 0 of the array is used
\ to hold the overall match.
9 constant subex-limit \ Maximum number of (..) pairs permitted
variable #subex \ Holds sub-expression number
2 cells constant subex-size \ For (caddr u)
variable subex-match \ Points to the final matching sub-expressions
0 subex-match !
variable current-subex \ Used for embedded code
0 current-subex !
variable #eolchars \ Holds number of end of line chars matched
object class
1 cells class-var sx-#new
1 cells class-var sx-#del
subex-limit 1+ subex-size * var subex \ The array
end-class SubExpression
0 SubExpression sx-#new !
0 SubExpression sx-#del !
:noname [ object :: new ] 1 over @ sx-#new +! ; SubExpression defines new
:noname 1 over @ sx-#del +! [ object :: delete ] ; SubExpression defines delete
: clear-subex ( sx -- )
subex [ subex-limit 1+ subex-size * ] literal over + swap
do 0. i 2! subex-size +loop
;
: clone-subex ( sx -- sx2 ) \ sx2 is new object containing contents of sx
subex SubExpression new tuck subex ( -- sx2 ad1 ad2 )
[ subex-limit 1+ subex-size * ] literal move
;
\ Necessary for initial open parenthesis state
: init-subex ( -- ) 0 #subex ! ;
: sx-inrange? ( n -- )
subex-limit u> abort" Index to subex array is out of range"
;
\ Do not need to check index as compilation of regular expression will
\ have detected too many pairs of (...), but leave in during development
: get-sxad ( i sx -- ad )
over sx-inrange?
subex swap subex-size * +
;
: get-sx ( i sx -- caddr u )
get-sxad 2@ tuck -
;
: get-subex ( i -- caddr u | 0 0 )
subex-match @ dup
if get-sx else and dup then
;
: get-match ( -- caddr u -1 | 0 0 0 )
0 get-subex 2dup + 0<>
;
\ get-subex[0] and set-subex[0] used to save and restore any matching
\ start address for look ahead and look behind
: get-subex[0] ( sx -- ad ) postpone subex postpone @ ; immediate
: set-subex[0] ( ad sx -- ) postpone subex postpone ! ; immediate
: merge-subex ( sx1 sx2 -- ) \ Result left in sx1
subex swap subex ( -- ad2 ad1 )
#subex @ 1+ subex-size * 0
do
over i + cell+ @
if
2dup swap i + 2@ rot i + 2!
then
subex-size
+loop
2drop
;
: ?subex-equal ( sx1 sx2 -- f )
subex swap subex ( --- ad2 ad1 )
#subex @ 1+ subex-size * 0
do
dup i + 2@ 2over drop i + 2@ d=
if subex-size else unloop 2drop false exit then
+loop
2drop true
;
: clear-subexmatch ( -- )
subex-match @ ?dup
if delete 0 subex-match ! then
;
[defined] [rgx-dev] [if]
: show-subex ( sx -- ) \ Development only
subex-match @ >r subex-match !
cr ." Sub-expressions:"
#subex @ 1+ 0
?do
cr i 0 .r [char] ) emit space
i get-subex 2dup swap . . type
loop cr
r> subex-match !
;
[then]
\ ---[ ListItem class ]----------------------------------------------------------
object class
1 cells var next-ptr
end-class ListItem
:noname ( class -- list )
[ object :: new ] ( -- list )
0 over next-ptr !
; ListItem defines new
\ ---[ State lists ]------------------------------------------------------------
\ --- State List Item (sli)
ListItem class
1 cells var pstate
1 cells var psubex
1 cells var prepstate
1 cells class-var sl-#new
1 cells class-var sl-#del
end-class StateListItem
0 StateListItem sl-#new !
0 StateListItem sl-#del !
:noname ( sx state class -- sli )
[ ListItem :: new ]
tuck pstate !
tuck psubex !
0 over prepstate !
1 over @ sl-#new +!
; StateListItem defines new
:noname ( sli -- )
1 over @ sl-#del +!
dup psubex @ ?dup if delete then
dup prepstate @ ?dup if delete then
[ ListItem :: delete ]
; StateListItem defines delete
\ --- State list header
\ Useful during development to have #items available, can be removed eventually
ListItem class
1 cells var ptail \ Points to the last item in the list
1 cells var #items \ Number of items in the list
end-class StateList
: new-statelist1 ( sli -- list )
StateList new
2dup next-ptr ! tuck ptail !
;
: new-statelist ( -- list )
StateList new
;
: clear-list ( list -- ) 0 swap 2dup next-ptr ! 2dup ptail ! #items ! ;
: append-to-list ( list sli -- )
0 over next-ptr !
over 2dup dup next-ptr @ ( -- list sli list sli list f )
if ptail @ then
next-ptr ! ptail ! ( -- list )
1 swap #items +! ( -- )
;
\ ---[ Global state object ]----------------------------------------------------
\ Used to hold the context of a regex match before starting a new match on
\ another regex or before executing embedded Forth code. This permits the user
\ to use the stack freely. Only one of these is active at any time, they are put
\ into a list, so that they can be deleted in the event of an ABORT or THROW.
\ The R stack could have been used but is less convenient.
ListItem class
2 cells var subj-str \ (caddr u) of the input text being matched
2 cells var match-pos \ Start position of current match
\ 1 cells var subex-match \ Points to subex array
8 cells var match-context \ Space for up to 8 cells
end-class MatchInfo
variable matcher \ Points to the current MatchInfo object
0 matcher !
:noname ( caddr u class -- )
[ ListItem :: new ] ( -- caddr u obj )
>r 2dup r@ subj-str 2! ( -- caddr u )
-1 /string r@ match-pos 2!
matcher @ r@ next-ptr !
r> matcher !
; MatchInfo defines new
: new-matcher ( caddr u -- ) MatchInfo new ;
: delete-matcher ( -- )
matcher @ dup next-ptr @ ( -- obj1 obj2 )
swap delete ( -- obj2 )
matcher !
;
: clear-matchers ( -- )
begin matcher @ while delete-matcher repeat
;
\ Save and restore stack in matcher object prior to look ahead and look back
: >matcher ( xn ... x1 n -- ) \ n must be even and <= 8
cells matcher @ match-context tuck + swap
do i 2! 2cells +loop
;
: matcher> ( n -- xn ... x1 ) \ n must be even and <= 8
2 - cells matcher @ match-context tuck +
do i 2@ 2cells negate +loop
;
\ Matcher field access
: subject ( -- ad ) matcher @ subj-str ;
: match-start ( -- ad ) matcher @ match-pos ;
\ ---[ Pre-defined sets ]-------------------------------------------------------
127 constant maxchar
maxchar :set char-class
\ Sets defined by \d \w etc
char-class new constant \d-set
char-class new constant \w-set
char-class new constant \s-set \ includes bl tab lf cr ff vt
char-class new constant allchars-set
\ Populate the sets - interpreter loops within the line using 0 >in !
char 0
dup char 9 1+ < [if] dup \d-set add-member 1+ 0 >in ! [then] drop
\d-set \w-set copy-set
char a
dup char z 1+ < [if] dup \w-set add-member 1+ 0 >in ! [then] drop
char A
dup char Z 1+ < [if] dup \w-set add-member 1+ 0 >in ! [then] drop
char _ \w-set add-member
bl \s-set add-member
9
dup 14 < [if] dup \s-set add-member 1+ 0 >in ! [then] drop
0
dup maxchar 1+ < [if] dup allchars-set add-member 1+ 0 >in ! [then] drop
\ ---[ NFA-State base class ]---------------------------------------------------
object class
1 cells var lastlist
1 cells var NFA-next
1 cells var NFA-x \ Data or pointer in subclasses
method match-char
method exec-state
end-class NFA-State
:noname ( x class -- 0 ad state )
[ object :: new ] ( -- x state )
dup >r NFA-x ! 0 r@ NFA-next ( -- 0 ad )
2dup ! r> ( - 0 ad state )
; NFA-State defines new
0 value match-state \ Will hold MatchState object
variable list-id 0 list-id !
variable step-id 0 step-id ! \ Prevents infinite split state loops
\ Moves a StateListItem to the end of the next list (nlist)
:noname ( nlist caddr sli state -- nlist caddr sli true )
list-id @ over lastlist !
over pstate ! 2>r ( -- nlist ) ( R: -- caddr sli )
dup r@ append-to-list 2r> true ( -- nlist caddr sli true )
; NFA-State defines exec-state
:noname ( caddr state -- false ) 2drop true ; NFA-State defines match-char
\ ---[ Character State classes ]------------------------------------------------
NFA-State class
1 cells -
1 cells var NFA-char \ Overlays NFA-x
end-class CharState
:noname ( caddr state -- f )
NFA-char @ swap c@ =
; CharState defines match-char
: find-state ( nlist state -- sli ) \ Find in next state list
swap next-ptr @ ( -- state sli )
begin
2dup pstate @ <>
while
dup 0= abort" State not found"
next-ptr @
repeat
nip
;
:noname ( nlist caddr sli state -- nlist caddr sli f )
over prepstate @ 0=
over lastlist @ list-id @ = and \ Is state already in the next list
if
2>r over r@ find-state ( -- nlist caddr sli2 ) ( R: -- sli state )
psubex @ 2r@ drop psubex @ ( -- nlist caddr sx2 sx1 )
?subex-equal ( -- nlist caddr f )
if 2r> 0= exit then ( -- nlist caddr sli 0 )
2r> ( -- nlist caddr sli state )
then
[ NFA-State :: exec-state ] ( -- nlist caddr sli f )
; CharState defines exec-state
\ --- Case insensitive single character
CharState class
end-class iCharState
:noname ( caddr state -- f )
NFA-char @ swap c@ >lower =
; iCharState defines match-char
\ ---[ Character Class State classes ]------------------------------------------
CharState class
1 cells -
1 cells var NFA-charclass \ Overlays NFA-x
end-class CharClassState
:noname ( class -- 0 ad state )
char-class new swap ( -- set class )
[ NFA-State :: new ] ( -- 0 ad state )
; CharClassState defines new
: new-CharClassState ( set -- 0 ad state ) \ For preset states \w etc
CharClassState [ NFA-State :: new ]
;
:noname ( caddr state -- f )
>r c@ r> NFA-charclass @ is-member
; CharClassState defines match-char
\ --- Negated character class [^...]
CharClassState class
end-class NegCharClassState
: new-NegCharClassState ( set -- 0 ad state ) \ For preset states \w etc
NegCharClassState [ NFA-State :: new ]
;
:noname ( caddr state -- f )
[ CharClassState :: match-char ] 0=
; NegCharClassState defines match-char
\ ---[ Match-any character State class ]----------------------------------------
\ For . (match any character but new line (LF))
CharState class
1 cells -
1 cells var dot-sflag
end-class DotState
:noname ( caddr state -- f )
dot-sflag @
if drop true else c@ ^lf <> then
; DotState defines match-char
\ ---[ Declaration of Split and Repeat State classes ]--------------------------
\ Here for the interface
NFA-State class
1 cells -
1 cells var NFA-altnext \ Overlays NFA-x
1 cells var ss-id \ To prevent infinite loops
end-class SplitState
SplitState class
1 cells var rep-max
1 cells var rep-min
end-class RepeatState
RepeatState class
1 cells var rep-count \ Number of repetitions
1 cells var rep-ref \ Reference count from prepstate sli's
method inrange-oper
end-class DoRepeatState
\ ---[ Split State class implementation ]---------------------------------------
\ Generated by | * + ? to create an extra matching path
:noname ( state1 state2 class -- state )
[ NFA-State :: new ] ( -- state1 0 ad state )
nip nip tuck NFA-next ! ( -- state )
; SplitState defines new
\ Used for greedy and lazy quantifiers
: new-splitter ( state1 state2 class -- ad state )
new dup NFA-altnext swap
;
\ The order of the use of next states below together with the insertion
\ of a list item at the tail of the next list means that, for capturing
\ parantheses, the close parenthesis always happens before the next open
\ of the same pair of parentheses, see example
\ (a|b)*a(a|b)(a|b)(a|b)(a|b)(a|b)(a|b)
\ It also has the benefit that the lazy path through split states are taken
\ first which is needed for lazy matches
: clone-sli ( sli1 state -- sli2 )
over psubex @ swap StateListItem new ( -- sli1 sli2 )
over prepstate @ over prepstate !
dup psubex @ clone-subex ( -- sli1 sli2 sx )
rot psubex ! ( -- sli2 )
;
: add-2states ( nlist caddr sli state state2 -- nlist caddr sli f )
2 pick 2 pick 2>r >r ( -- ... sli state ) ( R: -- sli state state2 )
NFA-altnext @ tuck clone-sli ( -- nlist caddr state3 sli2 )
r> over prepstate ! ( R: -- sli state )
swap exec-state ( -- nlist caddr sli2 f )
if drop else delete then
2r> NFA-next @ exec-state ( -- nlist caddr sli f ) ( R: -- )
;
: ?ssdone ( state -- state false | false true )
dup ss-id @ step-id @ =
if 0= -1 else step-id @ over ss-id ! 0 then
;
:noname ( nlist caddr sli state -- nlist caddr sli f )
?ssdone if exit then \ Already processed, so exit with f = false
over prepstate @ dup
if 1 over rep-ref +! then
add-2states
; SplitState defines exec-state
\ ---[ Repeating State class implementation ]-----------------------------------
\ For {n,m}, {n} and {n,}
:noname ( n m state1 state2 class -- ad state )
[ SplitState :: new ] ( -- n m state )
dup NFA-altnext swap ( -- n m ad state )
2>r r@ rep-max 2! ( -- )
0 r@ rep-ref ! 2r> ( -- ad state )
; RepeatState defines new
: new-dorep ( nlist caddr sli state class -- nlist caddr sli state3 )
over 2>r rep-max 2@ 1+ ( -- ... sli n m+1 )
r@ NFA-next @ r> NFA-altnext @ ( -- ... sli n m+1 state1 state2 )
r> new ( -- nlist caddr sli state3 )
2dup swap prepstate !
;
:noname ( nlist caddr sli state -- nlist caddr sli f )
DoRepeatState new-dorep
exec-state
; RepeatState defines exec-state
:noname ( n m+1 state1 state2 class -- state )
[ RepeatState :: new ] nip ( -- state )
0 over rep-count !
; DoRepeatState defines new
:noname ( state -- ) \ Only delete object when ref count is 0
dup rep-ref @ 0=
if
[ RepeatState :: delete ]
else
-1 swap rep-ref +!
then
; DoRepeatState defines delete
: ?repeat ( state -- state false | false true )
dup lastlist @ list-id @ = >r
list-id @ over lastlist ! r> ( -- state f )
if 0= -1 else ?ssdone then
;
:noname ( nlist caddr sli state -- nlist caddr sli f )
0 add-2states
; DoRepeatState defines inrange-oper
:noname ( nlist caddr sli state -- nlist caddr sli f )
?repeat if exit then ( -- nlist caddr sli [state | 0] )
dup >r rep-count @ 1 r@ rep-count +! ( nlist caddr sli ct )
dup r@ rep-min @ <
if
drop r> NFA-next @ exec-state ( -- nlist caddr sli f )
else
r@ rep-max 2@ within ( -- nlist caddr sli f2 )
if
r> inrange-oper ( -- nlist caddr sli f )
else
r> 0= ( -- nlist caddr sli 0 )
then
then
; DoRepeatState defines exec-state
\ ---[ End of Repeat state ]----------------------------------------------------
\ Always used at the end of a repeated sub-expression to ensure that separate
\ threads e.g. from (?:a|b){3,5} are joined into one so that there is a single
\ end state. It also simplifies use of prepstate. It does not use the NFA-next
\ pointer to get to the next DoRepeat state but the prepstate pointer in the
\ sli. This is because there may be multiple DoRepeat states active
\ simultaneously.
NFA-State class end-class EndRepState
:noname ( nlist caddr sli state -- nlist caddr sli f )
drop dup prepstate @ exec-state
; EndRepState defines exec-state
\ ---[ Start of string/line State class ]---------------------------------------
\ --- For ^ and \A (match start of line/text)
NFA-State class
1 cells -
1 cells var sol-mode \ 0 for \A and m mode = 0; <>0 for m mode <> 0
end-class StartState
\ 2variable subject-str \ Input string
\ : subject subject-str ; \ ********** Temporary 14/11/10
:noname ( nlist caddr sli state -- nlist caddr sli f )
>r over subject 2@ drop = ?dup 0=
if over 1 chars - c@ ^lf = r@ sol-mode @ and then
if
r> NFA-next @ exec-state
else
r> 0= ( -- nlist caddr sli 0 )
then
; StartState defines exec-state
\ ---[ End of string/line State classes ]---------------------------------------
\ --- For \z
\ Ignores m mode and doesn't match before end of line, only at end of string
NFA-State class end-class EOSState
:noname ( nlist caddr sli state -- nlist caddr sli f )
>r over subject 2@ chars + =
if
r> NFA-next @ exec-state
else
r> 0= ( -- nlist caddr sli 0 )
then
; EOSState defines exec-state
\ --- End of line state, for $ in normal mode and \Z both modes
NFA-State class end-class EOL$0\ZState
: ?lf ( caddr -- f ) c@ ^lf = ;
: ?endofline ( caddr -- n f ) \ n is number of chars remaining in string
dup >r \ f is true for end of line
subject 2@ chars + - negate dup ( -- n n )
case
0 of r> 0<> endof ( -- 0 true )
1 of r> ?lf endof ( -- 1 f )
r@ c@ ^cr =
r> char+ ?lf and swap ( -- n f n )
endcase ( -- n f )
;
:noname ( nlist caddr sli state -- nlist caddr sli f )
2>r dup ?endofline over 3 < and
if
dup #eolchars !
+ 2r> NFA-next @ exec-state
else
drop 2r> 0= ( -- nlist caddr sli 0 )
then
; EOL$0\ZState defines exec-state
\ --- End of line state, for $ in enhanced line anchor mode
NFA-State class end-class EOL$1State
variable #skipped \ Number of characters consumed during a match
\ Included for embedded end of line characters
:noname ( nlist caddr sli state -- nlist caddr sli f )
2>r dup ?endofline ( -- nlist caddr n f )
if
2 min dup #skipped +!
+ 2r> NFA-next @ exec-state
else
drop dup ?lf
if
1 #skipped +!
char+ 2r> NFA-next @ exec-state
else
2r> 0= ( -- nlist caddr sli 0 )
then
then
; EOL$1State defines exec-state
\ ---[ Parentheses State classes ]----------------------------------------------
NFA-State class
1 cells -
1 cells var par-#subex
end-class ParenState
\ --- Open parenthesis
ParenState class
end-class OpenParenState
\ Factor for parentheses states
: save-parenad ( caddr sli state offs -- caddr sli state )
2>r over #eolchars @ - ( -- caddr sli caddr' )
over r> r@ par-#subex @ ( -- caddr sli caddr' pl1 offs n )
rot psubex @ get-sxad + ! r> ( -- caddr sli state )
;
:noname ( nlist caddr sli state -- nlist caddr sli f )
0 save-parenad
NFA-next @ exec-state
; OpenParenState defines exec-state
\ --- Close parenthesis state
ParenState class
\ 1 cells var cp-name \ Holds xt of 2variable for named capture
end-class CloseParenState \ Not currently needed
:noname ( nlist caddr sli state -- nlist caddr sli f )
1 cells save-parenad
NFA-next @ exec-state ( -- nlist caddr sli f )
; CloseParenState defines exec-state
\ ---[ Word boundary State classes ]--------------------------------------------
\ For word boundary \b
NFA-State class
end-class BoundaryState
: ?boundary ( caddr -- f ) \ f is true for word boundary
>r subject 2@ drop r@ = ( -- f1 ) \ Start of line?
subject 2@ chars + r@ = or ( -- f2 ) \ End of line?
r@ c@ \w-set is-member ( -- f2 f3 )
r> -1 chars + c@ \w-set is-member ( -- f2 f3 f4 )
xor or ( -- f )
;
\ *** Note: could factor code from the next two states
:noname ( nlist caddr sli state -- nlist caddr sli f )
>r over ?boundary ( -- nlist caddr sli f ) ( R: -- state )
if
r> NFA-next @ exec-state ( -- nlist caddr sli f )
else
r> 0= ( -- nlist caddr sli 0 )
then
; BoundaryState defines exec-state
\ --- Not word boundary for \B
BoundaryState class
end-class NotBoundaryState
:noname ( nlist caddr sli state -- nlist' )
>r over ?boundary 0= ( -- nlist caddr sli f ) ( R: -- state )
if
r> NFA-next @ exec-state ( -- nlist caddr sli f )
else
r> 0= ( -- nlist caddr sli 0 )
then
; NotBoundaryState defines exec-state
\ ---[ Match State class ]------------------------------------------------------
\ Indicates a match has been found
ParenState class
end-class MatchState
0 MatchState new to match-state 2drop
\ ?longer-match is necessary when a lazy quantifier has caused a look ahead
\ to a longer solution than that just found
: ?longer-match ( sx -- f ) \ True if already have a longer match
0 get-subex nip ?dup ( -- sx u u | sx 0 )
if
0 rot get-sx nip > ( -- f )
else
0= ( -- false )
then
;
:noname ( nlist caddr sli state -- nlist caddr sli 0 )
1 cells save-parenad
>r dup psubex @ ( -- nlist caddr sli sx )
\ cr ." ---[ match ]-----------------------"
\ cr ." This match:" dup show-subex
dup ?longer-match
if
drop
else
list-id @ r@ lastlist @ = subex-match @ and
if \ Merge subex
subex-match @ swap merge-subex ( -- nlist caddr sli )
else \ Assign subex to subex-match
clear-subexmatch dup subex-match ! ( -- nlist caddr sli sx )
current-subex !
0 over psubex ! \ Ensure subex not deleted
then
then
\ ." Saved match subex"
\ subex-match @ show-subex
\ ." -----------------------------------" cr
list-id @ r@ lastlist ! r> 0= ( -- nlist caddr sli 0 )
; MatchState defines exec-state
\ ---[ Back reference State classes ]-------------------------------------------
\ Because regular expressions such as (abc)(\w)*\1 can keep re-entering the
\ back reference, the Back Reference state creates a new DoBackReference state
\ to do the comparison. Therefore there may be several of these states active
\ at any one time during matching.
SplitState class
2 cells var dbr-subex \ (caddr u) of sub-expression value
method adjust-case
end-class DoBackRefState
:noname ( caddr u state1 class -- state ) \ state1 --> NFA-altnext
[ NFA-State :: new ] nip nip dup >r ( -- caddr u state ) ( R: -- state )
dbr-subex 2! r> ( -- state )
dup dup NFA-next ! \ NFA-next points to self
; DoBackRefState defines new
:noname ( ch1 ch2 state -- ch1 ch2 ) \ Default is no-op
drop
; DoBackRefState defines adjust-case
: brefmatch ( state ch1 ch2 -- f )
= tuck ( -- f state f )
if
dup dbr-subex 2@
1 /string rot dbr-subex 2! ( -- true )
else
delete ( -- false ) \ So delete self
then
;
:noname ( caddr state -- f )
tuck >r c@ over dbr-subex 2@ ( -- state ch1 caddr2 u2 )
drop c@ r> adjust-case brefmatch ( -- f )
; DoBackRefState defines match-char
:noname ( nlist caddr sli state -- nlist caddr sli f )
?ssdone if exit then
dup >r dbr-subex @ ( -- nlist caddr sli u ) ( R: -- state )
if \ Still matching, add self to nlist
r> [ NFA-State :: exec-state ] ( -- nlist caddr sli f )
else \ Have matched the sub-expression
r@ NFA-altnext @ exec-state ( -- nlist )
r> delete \ Finished with self
then
; DoBackRefState defines exec-state
\ --- Do back reference for case insensitivity
DoBackRefState class end-class iDoBackRefState
:noname ( ch1 ch2 state -- ch2' ch1' )
drop >lower swap >lower
; iDoBackRefState defines adjust-case
\ --- Do back reference for lower-case \L ... \E
DoBackRefState class end-class lowDoBackRefState
:noname ( ch1 ch2 state -- ch1 ch2' )
drop >lower
; lowDoBackRefState defines adjust-case
\ --- Do back reference for upper case \U ... \E
DoBackRefState class end-class upDoBackRefState
:noname ( ch1 ch2 state -- ch1 ch2' )
drop >upper
; upDoBackRefState defines adjust-case
\ --- Single shot Do back reference
\ The code for this is naughty since it achieves the "single shot" effect
\ in method adjust-case by changing the objects class to that held in
\ next-NFAState. This is done by overwriting the object's class pointer
\ and so is mini-oof specific. Switching to another OO package would
\ involve changing this.
DoBackRefState class
1 cells var next-NFAState
end-class oneDoBackRefState
:noname ( caddr u state1 class2 class -- state ) \ state1 --> NFA-altnext
swap >r
[ DoBackRefState :: new ] ( -- state )
r> over next-NFAState !
; oneDoBackRefState defines new
:noname ( ch1 ch2 state -- ch1 ch2' )
dup next-NFAState @ swap !
; oneDoBackRefState defines adjust-case
\ --- Do back reference for first is lower case
oneDoBackRefState class end-class low1DoBackRefState
:noname ( ch1 ch2 state -- ch1 ch2' )
[ oneDoBackRefState :: adjust-case ]
>lower
; low1DoBackRefState defines adjust-case
\ --- Do back reference for first is upper case \u
oneDoBackRefState class end-class up1DoBackRefState
:noname ( ch1 ch2 state -- ch1 ch2' )
[ oneDoBackRefState :: adjust-case ]
>upper
; up1DoBackRefState defines adjust-case
\ --- Back Reference
\ Creates a suitable DoBackReference state
NFA-State class
1 cells -
1 cells var br-case
1 cells var br-index
end-class BackRefState
:noname ( index case class -- 0 ad state )
rot >r [ NFA-State :: new ]
r> over br-index !
; BackRefState defines new
: new-dobackref ( nlist caddr sli caddr2 u2 state -- nlist caddr sli f )
dup >r NFA-next @ over ( -- nlist caddr sli caddr2 u2 state2 u2 )
if
r> br-case @
case
0 of iDoBackRefState endof
1 of DoBackRefState endof
2 of lowDoBackRefState endof
3 of upDoBackRefState endof
4 of iDoBackRefState low1DoBackRefState endof
5 of iDoBackRefState up1DoBackRefState endof
6 of DoBackRefState low1DoBackRefState endof
7 of DoBackRefState up1DoBackRefState endof
8 of upDoBackRefState low1DoBackRefState endof
9 of lowDoBackRefState up1DoBackRefState endof
endcase
new ( -- nlist caddr sli state3 )
else
nip nip r> drop ( -- nlist caddr sli state2 )
then
exec-state ( -- nlist caddr sli f )
;
:noname ( nlist caddr sli state -- nlist caddr sli f )
dup >r br-index @ over psubex @ ( -- nlist caddr sli i sx )
get-sx r> new-dobackref ( -- nlist )
; BackRefState defines exec-state
\ --- Named reference state for \g{name}
BackRefState class
1 cells var name-xt
end-class NamedRefState
:noname ( index case xt class -- 0 ad state )
swap >r [ BackRefState :: new ] ( -- 0 ad state )
r> over name-xt ! ( -- 0 ad state )
; NamedRefState defines new
:noname ( nlist caddr sli state -- nlist caddr sli f )
dup >r br-index @
over psubex @ get-sx ( -- nlist caddr sli caddr2 u2 )
r> new-dobackref ( -- nlist caddr sli f )
; NamedRefState defines exec-state
\ ---[ Embedded code state (?{...}) ]-------------------------------------------
NFA-State class
1 cells -
1 cells var embed-xt
1 cells var is-embedded-test \ Boolean
end-class EmbeddedCode
:noname ( f xt class -- 0 ad state )
rot >r [ NFA-State :: new ] ( -- 0 ad state )
r> over is-embedded-test !
; EmbeddedCode defines new
: save-context ( clist caddr2 nlist caddr sli state x -- caddr sli )
2over 2>r subex-match @ ( clist caddr2 nlist caddr sli state x sx )
8 >matcher 2r> ( -- caddr sli )
;
: restore-context ( -- clist caddr2 nlist caddr sli state x )
8 matcher> subex-match !
;
:noname ( i*x clist caddr2 nlist caddr sli state
-- k*x clist caddr2 nlist caddr sli f)
dup >r 0 save-context ( -- i*x caddr sli ) ( R: -- state )
nip psubex @ dup
current-subex ! subex-match ! ( -- i*x )
r@ embed-xt @ execute ( -- j*x )
r> is-embedded-test @
if
>r restore-context drop ( -- k*x clist caddr2 nlist caddr sli state )
r> swap ( -- k*x clist caddr2 nlist caddr sli f2 state )
else
restore-context drop ( -- k*x clist caddr2 nlist caddr sli state )
then
NFA-next @ exec-state ( -- nlist caddr sli f )
; EmbeddedCode defines exec-state
\ ---[ Back Reference test state (?(\1)...) ]-----------------------------------
NFA-State class
1 cells -
1 cells var brt-index
end-class BackRefTest
: call-cond ( nlist caddr sli state f1 -- nlist caddr sli f2 )
0<> swap NFA-next @ exec-state
;
:noname ( nlist caddr sli state -- nlist caddr sli f )
2dup brt-index @ ( -- nlist caddr sli state sli index )
swap psubex @ get-sx ( -- nlist caddr sli state caddr2 u2 )
or call-cond ( -- nlist caddr sli f )
; BackRefTest defines exec-state
\ ---[ Named Reference test state (?(\1)...) ]----------------------------------
NFA-State class
1 cells -
1 cells var nr-xt
end-class NamedRefTest
:noname ( nlist caddr sli state -- nlist caddr sli f )
2dup nr-xt @ >body @ ( -- nlist caddr sli state sli index )
swap psubex @ get-sx
or call-cond
; NamedRefTest defines exec-state
\ ---[ Conditional State ]------------------------------------------------------
NFA-State class
1 cells -
1 cells var NFA-else \ Overlays NFA-x
end-class Conditional
\ A conditional state always follows a test state, which places an extra
\ parameter on the stack compared to other exec-state methods.
\ This could be folded into the test states for better performance
:noname ( nlist caddr sli f1 state -- nlist caddr sli f )
swap if NFA-next else NFA-else then
@ exec-state
; Conditional defines exec-state
\ ---[ Matching a regular expression ]------------------------------------------
[defined] [rgx-dev] [if]
: show-list ( list -- ) \ Development only
cr ." Items: " dup #items @ .
next-ptr @
begin
dup
while
cr ." sli: " dup 0 .r
." , subex: " dup psubex @ 0 .r
." , repstate: " dup prepstate @ 0 .r
." , state: " dup pstate @ dup 0 .r
." , state-char: " NFA-x @
dup bl 128 within if emit else drop then
dup psubex @ ?dup if show-subex then
next-ptr @
repeat
drop cr
;
: show-step-info ( list caddr -- list caddr )
cr ." ---[ Step, with character: "
dup if dup c@ dup bl 128 within if emit else drop ." none" then then
." at " dup .
." ]------------"
cr ." current list: " over show-list cr
\ cr ." ---[ step ]---------------------------------" cr
;
[then]
: step ( nlist clist caddr -- clist caddr nlist )
\ show-step-info
1 list-id +!
over swap 2swap next-ptr @ ( -- clist caddr nlist sli1 )
over clear-list
begin
dup
while
dup next-ptr @ >r >r ( -- clist caddr nlist ) ( R: -- sli2 sli1 )
over r@ pstate @ match-char ( -- clist caddr nlist f )
if \ Note nlist is used by exec-state
1 step-id +!
over char+ r> dup pstate @ ( -- clist caddr nlist caddr' sli1 state)
( R: -- sli2 )
NFA-next @
\ ." step calling exec-state" p
exec-state ( -- clist caddr nlist caddr' sli1 f )
\ ." after exec-state" p
if drop else delete then ( -- clist caddr nlist caddr' )
drop ( -- clist caddr nlist )
else
r> delete ( -- clist caddr nlist ) ( R: -- sli2 )
then
r> ( -- clist caddr nlist sli2 ) ( R: -- )
repeat
drop
\ ." ----[ List at end of step ]----------------"
\ 2 pick show-list
;
: delete-list ( list -- )
dup next-ptr @
begin
?dup
while
dup next-ptr @ swap delete
repeat
clear-list
;
: (match) ( l2 l1 -- l1 l2 f ) \ f = 0 means no match found
match-start 2@ >r ( -- l2 l1 ca ) ( R: -- u )
begin
1 #skipped !
step ( -- l1 ca l2 )
dup next-ptr @ ( -- l1 ca l2 sli )
while \ next list has states to be processed
swap r> #skipped @ /string ( -- l1 l2 ca' u' )
dup >r 0= ( -- l1 l2 ca' f ) ( R: -- u' )
until drop else nip then ( -- l1 l2 )
dup delete-list
r> drop subex-match @ ( -- sx | 0 )
;
defer save-global-state
defer restore-global-state
: start-list ( list state -- ) \ state is the first OpenParen state
SubExpression new dup clear-subex ( -- list state sx )
swap StateListItem new swap ( -- sli list )
1 over #items ! 2dup next-ptr ! ptail !
;
: init-nfa ( state1 -- state2 )
0 NFA-State new nip nip ( -- state state2 )
tuck NFA-next !
;
: regex-match ( rgx -- sx | 0 )
init-nfa >r
new-statelist new-statelist ( -- l2 l1 )
begin
1 list-id +! 0 #eolchars !
match-start @
while
dup r@ start-list
(match) 0=
while
match-start 2@ 1 /string match-start 2!
repeat then
delete delete r> delete subex-match @
;
\ V0.5 match changed to return:
\ (caddr u -1), (caddr u) is the rest of the subject string after the match
\ (caddr u 0 ), no match, (caddr u) is the subject string input to match
\ The user can use get-match to obtain the matching string
: match ( caddr u rgx -- caddr1 u1 f ) \ f TRUE for match else FALSE
clear-matchers
>r new-matcher ( -- )
clear-subexmatch 0 current-subex !
r> regex-match ( -- sx | 0 )
subex-match @ current-subex !
>r subject 2@ get-match ( -- caddr u caddr2 u2 f )
if + -rot + over - else 2drop then ( -- caddr1 u1 | caddr u )
r> 0<>
;
\ ---[ Look Around State classes ]----------------------------------------------
\ --- Positive look ahead for (?=
NFA-State class
1 cells -
1 cells var la-regex \ Points to look ahead regex
end-class LookAheadState
: get-rest ( caddr -- caddr1 u1 ) \ of subject string
subject 2@ + over -
;
: (look-ahead) ( clist caddr2 nlist caddr sli state state2
\ -- clist caddr2 nlist caddr sli state ca0 [sx | 0] )
>r over psubex @ get-subex[0]
save-context r> ( -- caddr sli state2 )
init-NFA dup >r ( -- caddr sli state3 ) ( R: -- state3 )
clone-sli 2dup ( -- caddr sli2 caddr sli2 )
psubex @ set-subex[0] ( -- caddr sli2 )
0 subex-match !
swap get-rest ( -- sli2 caddr3 u3 )
new-matcher ( -- sli2 )
0 new-statelist1 ( -- sli2 list1 )
swap new-statelist1 ( -- list1 list2 )
(match) r> delete ( -- list1 list2 sx ) ( R: -- )
>r delete delete ( -- ) ( R: -- sx )
delete-matcher
restore-context r> ( -- clist caddr2 nlist caddr sli state ca0 sx )
;
: use-lasubex ( sli state caddr sx -- sli state )
tuck set-subex[0] swap >r ( -- sli sx )
over psubex dup @ delete ! ( -- sli )
r> ( -- sli state )
;
: look-ahead ( caddr sli state -- caddr sli state ca0 [sx | 0] )
dup la-regex @ (look-ahead)
;
:noname ( nlist caddr sli state -- nlist caddr sli f )
look-ahead ?dup ( -- nlist caddr sli state ca0 [sx sx | 0] )
if
use-lasubex ( -- nlist caddr sli state )
NFA-next @ exec-state ( -- nlist caddr sli f )
else
drop 0= ( -- nlist caddr sli 0 )
then
; LookAheadState defines exec-state
\ --- Positive look ahead test state for (?(?= ... ) ... | ... )
LookAheadState class
end-class LookAheadTest
: lacall-cond ( nlist caddr sli state -- nlist caddr sli f )
nip swap NFA-next @ exec-state ( -- nlist caddr sli f )
;
:noname ( nlist caddr sli state -- nlist caddr sli f )
look-ahead 0<> lacall-cond
; LookAheadTest defines exec-state
\ --- Negative look ahead for (?!=
LookAheadState class
end-class NegLookAheadState
:noname ( nlist caddr sli state -- nlist caddr sli f )
look-ahead ?dup ( -- nlist caddr sli state ca0 [sx sx | 0] )
if
2drop 0= ( -- nlist caddr sli 0 )
else
drop NFA-next @ exec-state ( -- nlist caddr sli f )
then
; NegLookAheadState defines exec-state
\ --- Negative look ahead test state for (?(?! ... ) ... | ... )
NegLookAheadState class
end-class NegLookAheadTest
:noname ( nlist caddr sli state -- nlist caddr sli f )
look-ahead 0= lacall-cond
; NegLookAheadTest defines exec-state
\ --- Look behind for (?< and (?!<
LookAheadState class
1 cells var lb-neg \ True for negative look behind
end-class LookBackState
:noname ( f1 f2 class -- 0 ad state )
rot >r [ LookAheadState :: new ]
r> over lb-neg !
; LookBackState defines new
: ?use-subex ( sx sli -- )
over
if
psubex @ dup get-subex[0] >r ( -- sx sx2 )
2dup swap merge-subex
r> over set-subex[0]
over delete
then
2drop
;
: (lookback) ( clist caddr2 nlist caddr sli state
\ -- clist caddr2 nlist caddr sli sx2 sli state f )
dup la-regex @ >r #eolchars @ ( R: -- state2 )
save-context drop ( -- caddr )
\ list-id @ >r \ is desirable see issue 20
0 subex-match !
subject 2@ drop tuck - ( -- caddr3 u3 )
new-matcher ( -- )
r> regex-match >r ( R: -- sx2 | 0 )
delete-matcher
restore-context #eolchars ! ( -- clist ca2 nlist caddr sli state )
r> swap >r ( -- clist ca2 nlist caddr sli sx2 )
2dup ( -- clist ca2 nlist caddr sli sx2 sli sx2 )
0<> r> tuck lb-neg @ xor ( -- clist ca2 nlist caddr sli sx2 sli state f)
\ r> list-id ! \ see above
;
:noname ( nlist caddr sli state -- nlist caddr sli f )
(lookback)
2swap ?use-subex ( -- nlist caddr sli state f )
if
NFA-next @ exec-state ( -- nlist caddr sli f )
else
0= ( -- nlist caddr sli 0 )
then
; LookBackState defines exec-state
\ --- Look behind test for (?(?<...)...|...) and (?(?!<...)...|...)
LookBackState class
end-class LookBackTest
:noname ( nlist caddr sli state -- nlist caddr sli f )
(lookback) 2swap 2drop ( -- nlist caddr sli state f )
swap NFA-next @ exec-state
; LookBackTest defines exec-state
\ --- Atomic group state classes (independent subexpression) for (?>...)
\ Achieved by looking ahead for a match, if so create a repeating state to
\ count the number of characters in the match. When the count is reached
\ call the state following the atomic group
NFA-State class
1 cells -
1 cells var ar-next \ Overlays NFA-x
1 cells var ar-count
end-class AtomicRepState
:noname ( u state class -- state2 )
[ NFA-State :: new ] ( -- u 0 ad state2 )
tuck swap ! nip ( -- u state2 ) \ self to NFA-next
tuck ar-count ! ( -- state2 )
; AtomicRepState defines new
:noname ( caddr state -- true )
-1 swap ar-count +! 0<>
; AtomicRepState defines match-char
:noname ( nlist caddr sli state -- nlist caddr sli f )
dup ar-count @
if
[ NFA-State :: exec-state ] \ Append self to nlist
else
dup >r ar-next @ exec-state \ Count expired, continue
r> delete
then
; AtomicRepState defines exec-state
LookAheadState class
end-class AtomicState
:noname ( nlist caddr sli state -- nlist caddr sli f )
look-ahead ?dup ( -- nlist caddr sli state ca0 [sx sx | 0] )
if
0 over get-sx nip >r ( R: -- u ) \ u = number of chars matched
use-lasubex NFA-next @ r> ( -- nlist caddr sli state1 u )
tuck 0> ( -- nlist caddr sli u state1 f1 )
if
AtomicRepState new ( -- nlist caddr sli state2 )
[ NFA-State :: exec-state ] ( -- nlist caddr sli true )
else
nip exec-state
then
else
drop 0= ( -- nlist caddr sli 0 )
then
; AtomicState defines exec-state
\ ---[ Split state for lazy quantifiers *? +? ??
\ Lazy quantifiers are handled by choosing the laziest next state option from
\ the split state to call lookahead. If this returns a match, it's done;
\ otherwise use the NFA-next pointer to look ahead
\ *** Quantifier ??
SplitState class
end-class Lazy?State
: try-lazy ( sli state caddr sx -- nlist caddr sli state ca0 [sx 0 | -1] )
dup NFA-altnext @
(look-ahead) ?dup 0= ( -- nlist caddr sli state ca0 [sx 0 | -1] )
;
: try-greedy ( nlist caddr sli state -- nlist caddr sli state ca0 [sx 0 | -1] )
dup NFA-next @
(look-ahead) ?dup 0= ( -- nlist caddr sli state ca0 [sx 0 | -1] )
;
: la-failed ( -- caddr sli state x caddr1 -- caddr1 sli 0 )
nip 2swap nip rot 0=
;
: save-lasubex ( sli state caddr sx -- sli state u ) \ u is look ahead length
0 over get-sx nip >r ( R: -- u )
use-lasubex r> ( -- sli state u )
;
: bump-caddr ( caddr sli state u -- caddr+u sli state )
-rot 2>r + 2r>
;
: save-lazy ( sli state caddr sx -- sli true )
use-lasubex ( -- sli state )
over psubex @ subex-match !
0<> ( -- sli true )
;
:noname ( nlist caddr sli state -- nlist caddr sli f )
try-lazy ( -- nlist caddr sli state ca0 [sx 0 | -1] )
if
drop 2 pick >r ( -- nlist caddr sli state ) ( R: -- caddr )
try-greedy ( -- nlist caddr sli state ca0 [sx 0 | -1] )
if r> la-failed exit then ( -- nlist caddr sli false )
save-lasubex bump-caddr ( -- nlist caddr' sli state )
try-lazy ( -- nlist caddr' sli state ca0 [sx 0 | -1] )
if r> la-failed exit then ( -- nlist caddr sli false )
r> drop
then
save-lazy ( -- nlist caddr sli true )
; Lazy?State defines exec-state
\ *** Quantifier +?
SplitState class
end-class Lazy+State
:noname ( nlist caddr sli state -- nlist caddr sli f )
2 pick >r dup ( R: -- caddr ) \ For restoration on failure
begin
drop try-greedy ( -- nlist caddr sli state ca0 [sx 0 | -1] )
if r> la-failed exit then ( -- nlist caddr sli false )
save-lasubex bump-caddr ( -- nlist caddr' sli state )
try-lazy 0= ( -- nlist caddr' sli state ca0 [sx -1 | 0] )
until
save-lazy r> drop ( -- nlist caddr sli true )
; Lazy+State defines exec-state
\ *** Quantifier *?
Lazy+State class
end-class Lazy*State
:noname ( nlist caddr sli state -- nlist caddr sli f )
try-lazy ( -- nlist caddr sli state ca0 [sx 0 | -1] )
if
drop [ Lazy+State :: exec-state ]
else
save-lazy ( -- nlist caddr sli true )
then
; Lazy*State defines exec-state
\ *** Quantifiers {n,m}? {n,}? and {n}?
RepeatState class
1 cells var lzrep-next \ Points to greedier next state
end-class LazyRepState
:noname ( n m state1 state2 class -- ad state )
[ RepeatState :: new ]
dup NFA-next @ over lzrep-next !
; LazyRepState defines new
DoRepeatState class
method adjust-reps \ Adjust min, max and count for re-entry
end-class LazyDoRepState
:noname ( nlist caddr sli state -- nlist caddr sli f )
2 pick over 2>r ( R: -- caddr state )
LazyDoRepState new-dorep ( -- nlist caddr sli state2 )
match-state over NFA-altnext !
r@ NFA-next ! r@ dup \ Greedier link points to LazyDoRepState object
begin
drop 2dup NFA-next @ ( -- nlist caddr sli state sli state2 )
1 over rep-ref +! \ Prevent deletion of state2 in look ahead
dup adjust-reps
swap prepstate !
try-greedy
if
-1 r@ NFA-next @ rep-ref +! \ To enable state2 deletion
r@ lzrep-next @ r> NFA-next ! \ Restore the greedier link
r> la-failed exit
then
save-lasubex bump-caddr ( -- nlist caddr' sli state )
0 2 pick prepstate ! \ To avoid copy when sli cloned in try-lazy
try-lazy 0=
until
save-lazy -1 r@ NFA-next @ rep-ref +! \ To enable state2 deletion
r@ lzrep-next @ r> NFA-next !
r> drop
; LazyRepState defines exec-state
:noname ( nlist caddr sli state -- nlist caddr sli f )
NFA-altnext @ exec-state
; LazyDoRepState defines inrange-oper
:noname ( state -- )
dup rep-count @ ?dup
if
1- >r 1 over rep-min !
dup rep-max dup @ r> - swap !
0 over rep-count !
then
drop
; LazyDoRepState defines adjust-reps
\ ------------------------------------------------------------------------------
[defined] [rgx-dev] [if] .( regexmatch.fth loaded ) .s [then]
\ ------------------------------------------------------------------------------
\ RegEx - the regular expression compiler module
[defined] [rgx-dev] [if] .( Loading regexcompiler.fth ...) cr [then]
\ --- Mode modifier flags
\ Implemented as a bit vector of flags
\ Note that bits 8-10 are used in the String Builder
\ Bit
\ 7 6 5 4 3 2 1 0
\ ~~~~~~~~~~~~~~~
\ x x x 0 0 x x 0 (?-i) case sensitive
\ x x x 0 0 x x 1 (?i) case insensitive
\ x x x 0 1 x 0 x \L lower case until \E
\ x x x 0 1 x 1 x \U upper case until \E
\ x x x 1 0 0 x 0 \l next char lower case, the rest case sensitive
\ x x x 1 0 0 x 1 \l next char lower case, the rest case insensitive
\ x x x 1 0 1 x 0 \u next char upper case, the rest case sensitive
\ x x x 1 0 1 x 1 \u next char upper case, the rest case insensitive
\ x x x 1 1 0 0 x \l\L or \L\l same as \L until \E
\ x x x 1 1 0 1 x \l\U or \U\l until \E
\ x x x 1 1 1 0 x \u\L or \L\u until \E
\ x x x 1 1 1 1 x \u\U or \U\u same as \U until \E
\ x x 0 x x x x x (?-m) off - enhanced line-anchor match mode
\ x x 1 x x x x x (?m) on - " " " " "
\ x 0 x x x x x x (?-s) off - dot-matches-all mode
\ x 1 x x x x x x (?s) on - " " " "
\ 0 x x x x x x x (?-x) off - free-spacing and comment regex mode
\ 1 x x x x x x x (?x) on - " " " " " "
base @ decimal
1 constant i-flag
2 constant U-flag
4 constant l-flag
8 constant UL-flag
16 constant lu-flag
32 constant m-flag
64 constant s-flag
128 constant x-flag
base !
UL-flag lu-flag or dup 2 rshift constant Ul-mask
i-flag or U-flag or l-flag or constant case-mask
variable modifiers 0 modifiers !
variable default-remods 0 default-remods !
variable default-sbmods 0 default-sbmods !
: save-flag ( f bit ad -- )
>r swap 0= 0= over and swap ( -- [0|bit] bit )
invert r@ @ and or r> ! ( -- )
;
: save-default ( f bit -- ) default-remods save-flag ;
: save-idefault ( f -- ) i-flag save-default ;
: save-mdefault ( f -- ) m-flag save-default ;
: save-sdefault ( f -- ) s-flag save-default ;
: save-xdefault ( f -- ) x-flag save-default ;
: save-mod ( f bit -- ) modifiers save-flag ;
: save-mflag ( f -- ) m-flag save-mod ;
: save-sflag ( f -- ) s-flag save-mod ;
: save-xflag ( f -- ) x-flag save-mod ;
: save-iflag ( f -- )
dup
if
modifiers @ case-mask invert and \ Clear all case flags
modifiers !
then
i-flag save-mod
;
: save-uLflag ( -- ) 0 U-flag save-mod 1 UL-flag save-mod ;
: save-uUflag ( -- ) 1 U-flag save-mod 1 UL-flag save-mod ;
: save-llflag ( -- ) 0 l-flag save-mod 1 lu-flag save-mod ;
: save-luflag ( -- ) 1 l-flag save-mod 1 lu-flag save-mod ;
: get-mod ( bit -- f ) modifiers @ and 0<> ;
: get-iflag ( -- f ) i-flag get-mod ;
: get-mflag ( -- f ) m-flag get-mod ;
: get-sflag ( -- f ) s-flag get-mod ;
: get-xflag ( -- f ) x-flag get-mod ;
: clear-lu-flag ( -- ) 0 lu-flag save-mod ;
: clear-caseflags ( -- )
[ U-flag l-flag or UL-flag or lu-flag or invert ] literal
modifiers @ and modifiers !
;
: init-remodifiers default-remods @ modifiers ! ;
: init-sbmodifiers default-sbmods @ modifiers ! ;
: get-modifiers ( -- u ) \ Clears the lu-flag prior to stacking in the parser
modifiers @ [ lu-flag invert ] literal and
;
\ Map from bit vector to consecutive integers to ease further processing
\ 0 case insensitive, 1 case sensitive, 2 \L, 3 \U,
\ 4 \l + case insens, 5 \u + case insens, 6 \l + case sens, 7 \u + case sens
\ 8 \l\U first lower rest upper, 9 \u\L first upper rest lower
base @ decimal
create case-map
1 c, 0 c, 1 c, 0 c, 1 c, 0 c, 1 c, 0 c, 2 c, 2 c, 3 c, 3 c, 2 c, 2 c, 3 c, 3 c,
6 c, 4 c, 6 c, 4 c, 7 c, 5 c, 7 c, 5 c, 2 c, 2 c, 8 c, 8 c, 9 c, 9 c, 3 c, 3 c,
base !
: get-case ( -- u )
modifiers @ case-mask and ( -- flags ) \ in range 0..31
case-map + c@
;
\ ---[ Interface to the scanner ]-----------------------------------------------
\ ---[ Shared variables ]-------------------------------------------------------
variable sym
2variable symname
2variable regex-source
\ ---[ Token filter and line refill ]-------------------------------------------
\ Characters to start different tokens
char _ constant re-token \ For regex tokens
char ` constant cc-token \ For character class tokens
char ^ constant sb-token \ For string builder tokens
char ] constant sk-token \ For string builder skip tokens
variable regex-mode \ Hold one of the 4 constants above
0 value source-flag
end-of-regex constant end-of-line
: skip-whitespace ( caddr u tok -- caddr2 u2 tok2 )
begin dup white-space = while drop 2drop regex-mode @ next-token repeat
;
: get-token ( -- caddr u tok )
begin
regex-mode @ next-token
get-xflag
while
skip-whitespace source-flag
while
2 pick c@ [char] # =
over end-of-line = or
while
drop 2drop
refill 0= abort" Regex: unexpected end of file"
source 2dup regex-source 2! set-regex
repeat then then
\ cr dup . >r 2dup type r>
;
\ ---[ Interface to the regex parser ]------------------------------------------
\ Values to be loaded by the parser
0 value first-set
0 value bytes/set
0 value bits/cell
0 value parser-name
\ Transfer parser-name and first-set to these after regexparser.fth is loaded
0 value regex-parser-name
0 value parser-first-set
\ testsym? revised in version 0.8 to enable operation on 64-bit forths
\ and other word widths > 32 bits
8 constant bits/au \ number of set bits per address unit
: (testsym?) ( set-index ad -- f ) \ ad is first-set
swap bytes/set chars * + ( -- ad1 )
sym @ bits/au /mod chars rot + c@ ( -- bit vec )
1 rot lshift and ( -- f )
;
: testsym? ( set-index -- f ) parser-first-set (testsym?) ;
: test-token ( n -- f ) sym @ = ;
: report-error ( -- )
regex-source 2@ ( -- caddr u )
cr over swap type cr ( -- caddr )
regex-str 2@ drop ( -- caddr caddr2 )
swap - spaces ." ^ "
-1 abort" syntax error"
;
: nextsym ( -- ) get-token sym ! symname 2! ;
: ?nextsym ( f -- )
0= if report-error then
nextsym
;
\ ---[ Compilation of regular expressions ]-------------------------------------
\ Characters
\ Map the case integer to a smaller set to reduce repetition in litchar
create map-case2
0 c, 1 c, 2 c, 3 c, 2 c, 3 c, 2 c, 3 c, 2 c, 3 c, align
: litchar ( ch -- 0 ad state )
get-case clear-lu-flag map-case2 + c@
case 0 of >lower iCharState endof
1 of CharState endof
2 of >lower CharState endof
3 of >upper CharState endof
endcase
new ( -- 0 ad state )
;
\ Concatenation
: (<.>) ( 0 ad1 [ad2] state2 -- 0 state2 ) \ Save state2 at ad1 and ad2
over if swap >r recurse dup r> ! then
;
: <.> ( 0 ad1 [ad2] state1 0 ad3 [ad4] state2 -- 0 ad3 [ad4] state1 )
swap ?dup
if
>r recurse r> swap ( -- 0 ad3 [ad4] state1 )
else
swap >r (<.>) drop r> ( -- 0 state1 )
then
;
\ Alternation
: <|> ( 0 ad1 [ad2] state1 0 ad3 [ad4] state2 -- 0 ad1 [ad2] ad3 [ad4] state3 )
swap ?dup
if
>r recurse r> swap ( -- 0 ad1 [ad2] ad3 [ad4] state3 )
else
SplitState new-splitter nip ( state1 state2 -- state3 )
then
;
\ --- Greedy Quantifiers ? * +
\ ? zero or one
: <?> ( 0 ad1 [ad2...adn] state1 -- 0 ad1 [ad2...adn] ad state2 )
0 SplitState new-splitter
;
\ * zero or more
\ *** Note *** should be able to use a common recurser for <*> <+>
\ using an xt e.g.
\ : compile-quantifier ( 0 ad1 [ad2] state1 xt -- 0 ad state | 0 ad state1 state2 )
\ 2 pick
\ if
\ rot >r recurse dup r> !
\ else
\ execute
\ then
\ ;
\ Try it when everything is tested
: <*> ( 0 ad1 [ad2...adn] state1 -- 0 ad state )
over
if
swap >r recurse dup r> !
else
<?> ( -- 0 ad state )
then
;
\ + one or more
: (<+>) ( 0 ad1 [ad2...adn] state1 -- 0 ad state1 state2 )
over
if
swap >r recurse dup r> ! ( -- 0 ad state1 state2 )
else
dup <?> rot swap ( -- 0 ad state1 state2 )
then
;
: <+> ( 0 ad1 [ad2...adn] state1 -- 0 ad state1 ) (<+>) drop ;
\ --- Lazy quantifiers ?? *? +?
: (<??>) ( 0 ad1 [ad2...adn] state1 class -- 0 ad state )
2>r
begin dup while match-state swap ! repeat
2r> 0 swap new-splitter ( -- 0 ad state )
;
: <??> ( 0 ad1 [ad2...adn] state1 -- 0 ad state ) Lazy?State (<??>) ;
: <*?> ( 0 ad1 [ad2...adn] state1 -- 0 ad state ) Lazy*State (<??>) ;
: <+?> ( 0 ad1 [ad2...adn] state1 -- 0 ad state ) Lazy+State (<??>) ;
\ ---[ End of pattern compilation ]---------------------------------------------
\ The 0 was inserted by regex to indicate whether parsing has left two e's on
\ the stack, if not the regular expression was empty
: ?rgx ( x 0 x1 ... xn 1 -- x 0 x1 ... xn x )
dup pick if 1+ recurse else pick then
;
: ?<.> ( 0 0 ad1 state1 0 ad2 [ad3] state2
| 0 0 ad1 state1 -- 0 0 ad state )
1 ?rgx if <.> then
;
: (end-regex) ( 0 0 ad1 state1 0 ad2 [ad3] state2
\ | 0 0 ad1 state1 -- state1 )
1 ?rgx if <.> then ( -- 0 0 ad1 state1 | 0 0 ad2 [ad3] state1 )
0 -1 match-state <.> ( -- state1 )
nip nip nip
;
: end-regex ( 0 0 ad1 state1 0 ad2 [ad3] state2 | 0 0 ad1 state1 -- state )
(end-regex) \ nip nip nip
0 save-xflag
;
\ ---[ Actions embedded in the grammar and helpers ]----------------------------
: (open-charclass) ( class -- 0 ad state )
new cc-token regex-mode !
0 save-xflag \ free form x mode not allowed in character class
; \ x flag is saved and restored in bnf grammar
: open-charclass ( -- 0 ad state ) \ For [
CharClassState (open-charclass)
;
: open-negcharclass ( -- 0 ad state ) \ For [^
NegCharClassState (open-charclass)
;
: [+] ( ch state -- )
NFA-charclass @ swap get-case map-case2 + c@
case 0 of >upper 2dup >lower swap add-member endof
\ 1 of do nothing endof
2 of >lower endof
3 of >upper endof
endcase
swap add-member
;
[defined] [test] [if]
: [-] ( ch state -- ) NFA-charclass @ drop-member ;
: [..] ( ch1 ch2 state -- )
NFA-charclass @ -rot 1+ swap ( -- set c2+1 c1 )
do i over add-member loop drop
;
[then]
variable first-char \ For first char in a character class range
: add-to-class ( state char -- state )
dup first-char ! over [+]
;
: add-char ( state -- state ) \ To a character class
curr-char @ add-to-class
;
: add-range ( state -- state ) \ To a character class
first-char @ curr-char @ ( -- state ch1 ch2 )
2dup > if report-error then
1+ swap do i over [+] loop
;
: add-set-to-class ( state set -- state )
over NFA-charclass @ ( -- state set set2 )
union ( -- state )
;
\ add-allbut-to-class is used for \D \W \S to OR the wanted bits into the
\ character class. Note that Perl [12\D\W] and [12\W\D] will both match the
\ string 12&*abc which shows that wanted characters are ORed in rather than
\ unwanted characters being excluded
: add-allbut-to-class ( state set -- state )
char-class new dup >r ( -- state set set2 )
allchars-set over copy-set
drop-members ( -- state )
r@ over NFA-charclass @ ( -- state set2 set3 )
union r> delete ( -- state )
;
: close-charclass ( -- ) \ For ]
re-token regex-mode !
clear-lu-flag
;
: new-char-class ( set -- 0 ad state ) \ For \d \w \s
new-CharClassState
;
: new-negchar-class ( set -- 0 ad state ) \ For \D \W \S
new-NegCharClassState
;
: new-dot ( -- 0 ad state ) get-sflag DotState new ; \ For .
: textstart ( u -- 0 ad state ) StartState new ; \ For ^ and \A
: new-textstart ( -- 0 ad state ) get-mflag textstart ; \ For ^
: new-EOS ( -- 0 ad state ) \ For \z anchor
0 EOSState new
;
: new-$mode0 ( -- 0 ad state ) \ For normal $ and \Z
0 EOL$0\ZState new
;
: new-EOL ( -- 0 ad state ) \ For $ enhanced mode
get-mflag
if 0 EOL$1State new else new-$mode0 then
;
: new-boundary ( -- 0 ad state ) \ For \b
0 BoundaryState new
;
: new-notboundary ( -- 0 ad state ) \ For \B
0 NotBoundaryState new
;
: open-paren ( -- 0 ad state state ) \ For capturing parentheses
1 #subex +! #subex @ dup subex-limit >=
if ." Too many ('s increase subex-limit" report-error then
OpenParenState new dup
;
: close-capparen ( state xt -- 0 ad state2 )
>r par-#subex @ r@
if dup r@ >body ! then
r> drop CloseParenState new
;
: close-paren ( state -- 0 ad state2 ) 0 close-capparen ;
\ Buffer to hold a name as a counted string for FIND
32 constant max-namesize
create nbuf max-namesize 1+ chars allot align
: find-xt ( caddr u -- caddr 0 | xt <>0 )
dup max-namesize >
if ." Name too long" report-error then
dup nbuf c! nbuf char+ swap cmove ( -- )
nbuf find
;
: get-code/xt ( ch -- caddr 0 | xt <>0 )
parse-past-char ( -- caddr u )
find-xt
;
: ?refname ( xt -- xt )
dup >body cell+ @ NamedRefState <>
if ." Invalid reference name" report-error then
;
: ?name-found ( f -- )
0= if ." Name not found" report-error then
;
: get-capname ( -- xt ) \ Named capture, xt of 2variable
[char] > get-code/xt
?name-found ?refname ( -- xt )
;
\ Does not need to check for + - or digits as the scanner has filtered out
\ non-digits
: >decnumber ( ud1 caddr1 u1 -- ud2 caddr2 u2 ) \ Decimal conversion
base @ >r decimal
>number
r> base !
;
: check-refnum ( u -- )
#subex @ >
if \ Report error and abort
." Back reference number too large" report-error
then
;
: sym>num ( -- u )
0 0 symname 2@ 1 /string ( -- ud1 caddr u )
>decnumber 2drop drop ( -- u )
;
: get-brnum ( -- u )
sym>num dup check-refnum
;
: new-backref ( -- 0 ad state ) \ For \1 etc
get-brnum ( -- u )
get-case BackRefState new ( -- 0 ad state )
clear-lu-flag
;
: new-backreftest ( -- 0 ad state )
get-brnum BackRefTest new
;
: refname ( -- )
create 0 , NamedRefState , \ index class
does> ( -- caddr u | 0 0 )
@ current-subex @ dup
if get-sx else and dup then
;
: ?name-found ( xt <>0 -- xt ) \ or abort
0= if ." Named reference: name not found" report-error then
;
: get-refname ( -- xt )
[char] } get-code/xt ( -- caddr 0 | xt <>0 )
?name-found
;
: new-namedref ( -- 0 ad state ) \ For \g{
get-refname ?refname ( -- xt )
dup >body @ get-case rot ( -- index case xt )
NamedRefState new ( -- 0 ad state )
clear-lu-flag
;
: name/num-test ( -- 0 ad state )
[char] ) parse-to-char 0 0 ( -- caddr u d )
2over >decnumber ?dup ( -- caddr u d2 caddr2 u2 u2 | 0 )
if \ Not fully converted, may be a name
2drop 2drop
find-xt ?name-found ?refname ( -- xt )
NamedRefTest
else
2drop nip nip dup check-refnum
BackRefTest
then
new ( -- 0 ad state )
;
: look-regex ( [f] class -- 0 ad1 state1 0 0 ad2 state2 ) \ f for look behind
0 swap new ( -- 0 ad1 state1 )
0 0 OpenParenState new ( -- 0 ad1 state1 0 0 ad state )
;
: notyet ( -- )
." Feature not yet implemented" report-error
;
: new-lookback ( f1 f2 -- 0 ad1 state1 0 0 ad2 state2 ) \ (?<= (?<! (?(?< (?(?<!
swap if LookBackState else LookBackTest then look-regex
;
: new-lookahead ( f -- 0 ad1 state1 0 0 ad2 state2 ) \ For (?= (?(?=
if LookAheadState else LookAheadTest then look-regex
;
: new-neglookahead ( f -- 0 ad1 state1 0 0 ad2 state2 ) \ For (?! (?(?!
if NegLookAheadState else NegLookAheadTest then look-regex
;
: end-look ( 0 ad state 0 0 ad1 state1 [0 ad2 state2] -- 0 ad state )
(end-regex) ( -- 0 ad state state3 )
over la-regex ! ( -- 0 ad state )
;
: new-atomic ( -- 0 ad1 state1 0 0 ad2 state2 ) \ For (?>
AtomicState look-regex
;
\ For possessive quantifiers
: make-atomic ( 0 ad1 state1 -- 0 ad2 state2 0 0 ad3 state3 0 ad1 state1 )
?dup if >r recurse r> else new-atomic 0 then
;
\ To handle the quantifier {n,m}. Note that n and m will be >= 0 otherwise
\ new-interval is not called e.g. {-2,3} is not recognised as an interval by
\ the scanner but as separate characters '{', '-' etc
: get-{int} ( caddr u -- ud caddr' u' )
1 /string 0 0 2swap >decnumber
;
: ((new-interval)) ( 0 ad1 state1 class n m -- 0 ad2 state2 )
rot >r 2>r 0 EndRepState new <.> ( -- 0 ad state )
nip 2r> rot ( -- 0 n m state )
0 r> new ( -- 0 ad2 state2 )
;
: (new-interval) ( 0 ad1 state1 class -- 0 ad2 state2 ) \ For {n,m}
symname 2@ get-{int} get-{int} ( -- 0 ad1 state1 class ud1 ud2 caddr u )
2drop drop nip ( -- 0 ad1 state1 class n m )
2dup u>
if ." n must be <= m" report-error then
((new-interval)) ( -- 0 ad2 state2 )
;
: new-interval ( 0 ad1 state1 -- 0 ad2 state2 ) \ For {n,m}
RepeatState (new-interval)
;
: get-{1int} ( -- n ) symname 2@ get-{int} 2drop drop ;
: new-repn+ ( 0 ad1 state1 class -- 0 ad1 state2 )
get-{1int} -1 ((new-interval)) \ -1 as future comparisons are unsigned
;
: new-atleastn ( 0 ad1 state1 -- 0 ad1 state2 ) \ For {n,}
RepeatState new-repn+
;
: new-repn ( 0 ad1 state1 class -- 0 ad1 state2 )
get-{1int} dup ((new-interval))
;
: new-exactlyn ( 0 ad1 state1 -- 0 ad1 state2 ) \ For {n}
RepeatState new-repn
;
\ For lazy repeat quantifiers {n,m}? {n,}? and {n}?
: new-lazy-interval ( 0 ad1 state1 -- 0 ad2 state2 )
LazyRepState (new-interval)
;
: new-lazyn+ ( 0 ad1 state1 -- 0 ad1 state2 ) \ For {n,}?
LazyRepState new-repn+
;
: new-lazyn ( 0 ad1 state1 -- 0 ad1 state2 ) \ For {n}?
LazyRepState new-repn
;
: get-controlchar ( -- u ) \ 0 <= u < 32
get-char dup 0< ( -- ch f )
if ." Character expected after \c" report-error then
bl mod
;
: get-xnum ( u1 -- u2 )
base @ >r hex
>r 0 0 get-string r> min ( -- ud caddr1 u1 )
>number drop reset-pos drop ( -- u2 )
r> base !
;
: get-smallxnum ( -- u ) 2 get-xnum ; \ For \x
: get-bigxnum ( -- u ) get-string drop get-xnum ; \ For (?{...})
: spaces? ( caddr u -- f ) \ Return true if string contains a space
begin
dup
while
over c@ bl <>
while
1 /string
repeat then
nip 0<>
;
:noname ; constant noop
: (embed-code) ( -- xt )
[char] } parse-past-char ( -- caddr u )
?dup 0= if drop noop exit then
2dup spaces? ( -- caddr u f )
if \ multiple forth words, so compile them
2>r :noname 2r> ( -- xt colon-sys? caddr u )
evaluate postpone ; ( -- xt )
else \ single word, find and save its xt
find-xt 0= ( -- caddr -1 | xt2 0 )
if ." No such name" report-error then \ Aborts
then
;
: embed-code ( -- 0 ad state ) \ For (?{...}
0 (embed-code) EmbeddedCode new ( -- 0 ad state )
;
\ For embedded forth code conditional
: embed-test ( -- 0 ad state ) -1 (embed-code) EmbeddedCode new ;
: new-conditional ( -- 0 ad state ad2 ) 0 Conditional new dup NFA-altnext ;
: drop-zero ( x 0 x1 ... xn 1 -- x x1 ... xn )
dup pick if 1+ recurse else 1- roll drop then
;
: ?null<.> ( 0 ad1 state1 0 -- 0 ad1 state1 )
( 0 ad1 state1 0 ad2 state2 -- 0 ad2 state1 )
?dup if 1 drop-zero <.> then
;
: roll-state ( state 0 ad1 ... adn -- ad1 ... adn state ) \ n >= 0
?dup if >r recurse r> swap then
;
: else<.> ( state1 0 ad -- ad state1 )
( state1 0 0 ad1 [...adn] state2 ad -- ad1 [...adn] state1 )
over if ! 1 drop-zero then \ Patch conditional state altnext
roll-state
;
\ get-switch returns true for (?i etc, false for (?-i etc
: (get-switch) ( caddr -- f ) 2 chars + c@ [char] - <> ;
: get-switch ( -- f ) symname 2@ drop (get-switch) ;
\ ---[ To parse and compile a regular expression ]------------------------------
: (regex) ( caddr u -- state ) \ (caddr u) is the regular expression
init-subex re-token regex-mode !
init-remodifiers 0 save-xflag
2dup regex-source 2! set-regex
0 0 OpenParenState new ( -- 0 0 ad state )
nextsym regex-parser-name execute
;
: regex$ ( caddr u -- state )
save-global-state >r
0 to source-flag (regex)
r> restore-global-state
;
: parse-regex ( char "ccc<char>" -- state )
parse regex$
;
: regex ( -- state )
save-global-state >r
source >in @ /string -1 to source-flag
(regex)
source nip regex-str @ - >in !
r> restore-global-state
;
\ ------------------------------------------------------------------------------
[defined] [rgx-dev] [if] .( regexcompiler.fth loaded ) .s [then]
\ Parser generated by Grace
\ See http://www.qlikz.org/forth/grace/grace.html
16 to bytes/set
32 to bits/cell
: <savemod> 0 testsym? if 1 testsym? if save-iflag 70 test-token if
nextsym else 72 test-token ?nextsym then else save-mflag 74
test-token if nextsym else 76 test-token ?nextsym then then else
2 testsym? if save-sflag 78 test-token if nextsym else 80
test-token ?nextsym then else save-xflag 82 test-token if nextsym
else 84 test-token ?nextsym then then then ; : <savemodspan> 3 testsym? if 4
testsym? if save-iflag 71 test-token if nextsym else 73
test-token ?nextsym then else save-mflag 75 test-token if nextsym
else 77 test-token ?nextsym then then else 5 testsym? if save-sflag 79
test-token if nextsym else 81 test-token ?nextsym then else
save-xflag 83 test-token if nextsym else 85 test-token ?nextsym
then then then ; : <escchar> 6 testsym? if 91 test-token if ^bel
nextsym else 7 testsym? if 93 test-token if ^esc nextsym else
^ff 94 test-token ?nextsym then else 95 test-token if ^lf nextsym
else ^cr 96 test-token ?nextsym then then then else 8 testsym? if 9 testsym? if
97 test-token if ^ht nextsym else ^vt 98 test-token ?nextsym then
else 102 test-token if ^nul nextsym else bl 48 test-token
?nextsym then then else 10 testsym? if 49 test-token if [char] #
nextsym else get-controlchar 92 test-token ?nextsym then else 99 test-token if
get-smallxnum nextsym else get-bigxnum 100 test-token ?nextsym
101 test-token ?nextsym then then then then ; : <metachar1> 11 testsym? if 36
test-token if [char] ] nextsym else [char] ^ 39 test-token
?nextsym then else 40 test-token if [char] $ nextsym else
[char] \ 46 test-token ?nextsym then then ; : <escapedccchar> 12 testsym? if 13
testsym? if <escchar> else ^bs 55 test-token ?nextsym then else 14 testsym? if
<metachar1> else [char] - 47 test-token ?nextsym then then add-to-class ;
: <rangespec> 87 test-token ?nextsym 0 test-token if add-range
nextsym else <escapedccchar> then ; : <presetchars> 15 testsym? if 27
test-token if \d-set nextsym else 29 test-token if \w-set
nextsym else \s-set 31 test-token ?nextsym then then
add-set-to-class else 28 test-token if \d-set nextsym else 30
test-token if \w-set nextsym else \s-set 32 test-token ?nextsym
then then add-allbut-to-class then ; : <charorrange> 16 testsym? if 0 test-token
if add-char nextsym else <escapedccchar> then 87 test-token if
<rangespec> then else <presetchars> then ; : <presetclass> 15 testsym? if 27
test-token if \d-set nextsym else 29 test-token if \w-set
nextsym else \s-set 31 test-token ?nextsym then then new-char-class
else 28 test-token if \d-set nextsym else 30 test-token if \w-set
nextsym else \s-set 32 test-token ?nextsym then then
new-negchar-class then ; : <charclass> 17 testsym? if get-xflag >r 18 testsym?
if 23 test-token if open-charclass nextsym else open-negcharclass
25 test-token ?nextsym then begin <charorrange> 19 testsym? 0= until else 24
test-token if open-charclass [char] - add-to-class nextsym else
open-charclass [char] ] add-to-class 26 test-token ?nextsym then 87 test-token
if <rangespec> then begin 19 testsym? while <charorrange> repeat then
close-charclass r> save-xflag nextsym else <presetclass> then ;
: <metachar> 20 testsym? if 33 test-token if [char] { nextsym
else 34 test-token if rbrace nextsym else [char] [ 35 test-token
?nextsym then then else 21 testsym? if 22 testsym? if 37 test-token if [char] (
nextsym else [char] ) 38 test-token ?nextsym then else 41
test-token if [char] . nextsym else [char] | 42 test-token
?nextsym then then else 23 testsym? if 43 test-token if [char] *
nextsym else [char] + 44 test-token ?nextsym then else 45 test-token if
[char] ? nextsym else <metachar1> then then then then litchar ;
defer <subregex>
: <lookaround> get-modifiers >r 24 testsym? if 59 test-token if 0
nextsym else -1 60 test-token ?nextsym then new-lookback 25 testsym? if
<subregex> then new-EOS <.> else 57 test-token if nextsym
new-lookahead else 58 test-token ?nextsym new-neglookahead then 25 testsym? if
<subregex> then then end-look 62 test-token ?nextsym r> modifiers ! ;
defer <modifier>
defer <concat>
: <conditional> 66 test-token ?nextsym 61 test-token if name/num-test
nextsym 62 test-token ?nextsym else 26 testsym? if 0 <lookaround>
else embed-test 67 test-token ?nextsym 62 test-token ?nextsym then then
new-conditional >r <.> 0 25 testsym? if begin 27 testsym? while <modifier>
repeat <concat> then ?null<.> 0 3 test-token if nextsym 25
testsym? if begin 27 testsym? while <modifier> repeat <concat> then then
r> else<.> 62 test-token ?nextsym ; : <group> get-modifiers >r 28 testsym? if 61
test-token if open-paren >r nextsym <subregex>
<.> r> close-paren <.> 62 test-token ?nextsym else 63 test-token if
nextsym <subregex> 62 test-token ?nextsym else
open-paren get-capname 2>r 64 test-token ?nextsym <subregex>
<.> 2r> close-capparen <.> 62 test-token ?nextsym then then else 29 testsym? if
65 test-token if nextsym new-atomic 25 testsym? if <subregex>
then end-look 62 test-token ?nextsym else <conditional> then else 67 test-token
if embed-code nextsym 62 test-token ?nextsym else notyet 68
test-token ?nextsym 69 test-token ?nextsym then then then r> modifiers ! ;
: <anchor> 30 testsym? if 50 test-token if new-textstart nextsym
else 51 test-token if 0 textstart nextsym else new-EOL 52
test-token ?nextsym then then else 31 testsym? if 53 test-token if new-EOS
nextsym else new-$mode0 54 test-token ?nextsym then else 55
test-token if new-boundary nextsym else new-notboundary 56
test-token ?nextsym then then then ; : <escapedchar> <escchar> litchar ;
: <reference> 109 test-token if new-namedref nextsym else
new-backref 108 test-token ?nextsym then ; : <term> 32 testsym? if
curr-char @ litchar 0 test-token if nextsym else 2 test-token
?nextsym then else 33 testsym? if 34 testsym? if 35 testsym? if <charclass> else
<metachar> then else 36 testsym? if <group> else new-dot 22 test-token ?nextsym
then then else 37 testsym? if 38 testsym? if <anchor> else -1 <lookaround> then
else 13 testsym? if <escapedchar> else <reference> then then then then ;
: <greedyquant> 39 testsym? if 4 test-token if <*> nextsym else
<+> 5 test-token ?nextsym then else 40 testsym? if 6 test-token if <?>
nextsym else new-interval 7 test-token ?nextsym then else 8
test-token if new-atleastn nextsym else new-exactlyn 9 test-token
?nextsym then then then ; : <lazyquant> 41 testsym? if 10 test-token if <*?>
nextsym else <+?> 11 test-token ?nextsym then else 42 testsym? if 12
test-token if <??> nextsym else new-lazy-interval 13 test-token
?nextsym then else 14 test-token if new-lazyn+ nextsym else
new-lazyn 15 test-token ?nextsym then then then ; : <possquant> make-atomic 43
testsym? if 16 test-token if <*> nextsym else <+> 17 test-token
?nextsym then else 44 testsym? if 18 test-token if <?> nextsym
else new-interval 19 test-token ?nextsym then else 20 test-token if new-atleastn
nextsym else new-exactlyn 21 test-token ?nextsym then then then
end-look ; : <quantifier> 45 testsym? if <greedyquant> else 46 testsym? if
<lazyquant> else <possquant> then then ; : <closure> <term> 47 testsym? if
<quantifier> then begin 27 testsym? while <modifier> repeat ; :noname <closure>
begin 48 testsym? while <closure> <.> repeat ; is <concat> :noname begin 27
testsym? while <modifier> repeat <concat> begin 3 test-token while
nextsym <concat> <|> repeat ; is <subregex> : <modemod> 49 testsym? if
get-switch <savemod> else get-modifiers >r get-switch <savemodspan> <subregex>
r> modifiers ! 62 test-token ?nextsym 47 testsym? if <quantifier> then <.> then ;
: <casefolder> 103 test-token if save-llflag nextsym else 50
testsym? if 105 test-token if save-luflag nextsym else
save-uLflag 104 test-token ?nextsym then else 106 test-token if save-uUflag
nextsym else clear-caseflags 107 test-token ?nextsym then then then ;
:noname 51 testsym? if <modemod> else <casefolder> then ; is <modifier>
: <regex> 27 testsym? if <modifier> then 25 testsym? if <subregex> then
end-regex 1 test-token if nextsym else 86 test-token ?nextsym then ;
: this-parser <regex> ;
' this-parser to parser-name
: ~ 0 0 parse-name >number 2drop drop 4 0 do dup c, 8 rshift loop drop ;
here to first-set base @
decimal 36 base ! ~ 0 ~ 0 ~ 474 ~ 0 ~ 0 ~ 0 ~ 8W ~ 0 ~ 0 ~ 0 ~ 1R7K ~ 0 ~ 0 ~ 0
~ 8E8 ~ 0 ~ 0 ~ 0 ~ HS ~ 0 ~ 0 ~ 0 ~ 3IF4 ~ 0 ~ 0 ~ 0 ~ 1SDDSSG ~ 1 ~ 0 ~ 0
~ QMX0QO ~ 0 ~ 0 ~ 1EKG ~ 0 ~ 1Y ~ 0 ~ 0 ~ 0 ~ 6 ~ 0 ~ 2T4W ~ 4FTI4G ~ 0 ~ 0
~ 40 ~ 0 ~ 0 ~ 0 ~ 540E8 ~ 1WT7AWW ~ 2N ~ 0 ~ 47PC ~ 1WT7AWW ~ 2N ~ 0 ~ CY8 ~ 0
~ 0 ~ 1AM3SAO ~ 0 ~ 0 ~ 0 ~ 1 ~ 552MO ~ 1WT7AWW ~ 2N ~ 22WYDC ~ 0 ~ 0 ~ 0
~ OYZGG ~ 0 ~ 0 ~ 0 ~ 1WT7AWX ~ 552MP ~ 1WT7AWW ~ 2N ~ 0 ~ E ~ 0 ~ 0 ~ 0 ~ 19C
~ 0 ~ 0 ~ 0 ~ 2O ~ 0 ~ 0 ~ 0 ~ 4QO ~ 0 ~ 0 ~ 0 ~ 6NQ96O ~ 0 ~ 0 ~ 1YYM5MT
~ 1H9TC73 ~ 1WVP78F ~ CM7 ~ 0 ~ 8BNTHC ~ 0 ~ 0 ~ 0 ~ 0 ~ 2HWAO ~ 328 ~ 0
~ 18E718G ~ 1 ~ 0 ~ 0 ~ 0 ~ 6 ~ 0 ~ 0 ~ 13BWG ~ 0 ~ 0 ~ 0 ~ 3QUIO ~ 0 ~ 0 ~ 5
~ 0 ~ 0 ~ 0 ~ 1YYM5MO ~ 18E7QIN ~ V ~ 0 ~ 1YW49A8 ~ PA7 ~ 0 ~ 0 ~ 1YW49A8 ~ 1
~ 0 ~ 0 ~ 0 ~ 18E718G ~ V ~ 0 ~ 0 ~ 8VHDZ4 ~ 0 ~ 0 ~ 0 ~ JTKHS ~ 0 ~ 0 ~ 1C ~ 0
~ 0 ~ 0 ~ 5C ~ 0 ~ 0 ~ 0 ~ 2DC ~ 0 ~ 0 ~ 0 ~ 9HC ~ 0 ~ 0 ~ 0 ~ 47PC ~ 0 ~ 0 ~ 0
~ GUTC ~ 0 ~ 0 ~ 0 ~ S0 ~ 0 ~ 0 ~ 0 ~ 1DS0 ~ 0 ~ 0 ~ 0 ~ 2HWC0 ~ 0 ~ 0 ~ 0
~ 1YYM5MT ~ 1H9TC73 ~ 1WT7AXR ~ 9JZ ~ 0 ~ 0 ~ TYRK ~ 0 ~ 0 ~ 0 ~ 0 ~ LC ~ 0 ~ 0
~ 2HWAO ~ 0 base !
\ ------------------------------------------------------------------------------
\ Regex - Skip parts of a pattern string in String Builder
[defined] [rgx-dev] [if] .( Loading skip.fth ...) cr [then]
\ ---[ Free parser-name and first-set for skip parser ]-------------------------
parser-name to regex-parser-name
first-set to parser-first-set
\ ------------------------------------------------------------------------------
0 value skip-first-set
\ testsym? revised for operation with >32 bit Forths
: testsym? ( set-index -- f ) skip-first-set (testsym?) ;
\ ------------------------------------------------------------------------------
: skip-code ( -- ) [char] } parse-to-char 2drop ;
\ ------------------------------------------------------------------------------
: skipper ( xt -- )
sk-token regex-mode !
\ nextsym execute
execute
sb-token regex-mode !
;
\ ------------------------------------------------------------------------------
[defined] [rgx-dev] [if] .( skip.fth loaded ) .s [then]
\ Parser generated by Grace
\ See http://www.qlikz.org/forth/grace/grace.html
16 to bytes/set
32 to bits/cell
: <forth> skip-code 67 test-token ?nextsym 69 test-token ?nextsym ; : <test>
[char] ) parse-to-char 2drop 61 test-token ?nextsym 62 test-token ?nextsym ;
defer <skipitems>
: <freelayout> 0 testsym? if get-switch save-xflag 82 test-token if
nextsym else 84 test-token ?nextsym then else
get-modifiers >r get-switch save-xflag 83 test-token if nextsym
else 85 test-token ?nextsym then begin 1 testsym? while <skipitems> repeat
r> modifiers ! 62 test-token ?nextsym then ;
defer <skip_strcode>
:noname 0 test-token if
nextsym else 2 testsym? if 3 testsym? if 2 test-token if
nextsym else <forth> then else 66 test-token if
nextsym 67 test-token if <forth> else <test> then <skip_strcode> 3 test-token
if nextsym <skip_strcode> then 62 test-token ?nextsym else
<freelayout> then then else 4 testsym? if 114 test-token if
nextsym else 115 test-token ?nextsym then else 116 test-token if
nextsym else 117 test-token ?nextsym then then then then ; is <skipitems>
:noname begin 1 testsym? while <skipitems> repeat ; is <skip_strcode>
: this-parser <skip_strcode> ;
' this-parser to parser-name
: ~ 0 0 parse-name >number 2drop drop 4 0 do dup c, 8 rshift loop drop ;
here to first-set base @
decimal 36 base ! ~ 0 ~ 0 ~ S3CW ~ 0 ~ 5 ~ 0 ~ 2CA30 ~ 2CA2O ~ 4 ~ 0 ~ 2CA30 ~ 0
~ 4 ~ 0 ~ 8 ~ 0 ~ 0 ~ 0 ~ 0 ~ GUTC base !
\ ------------------------------------------------------------------------------
\ Regex - String builder module
\ To use results from a regex match
[defined] [rgx-dev] [if] .( Loading strbuild.fth ...) cr [then]
\ ---[ Free first-set for string builder parser ]-------------------------------
first-set to skip-first-set
\ ---[ Concatenation buffer ]---------------------------------------------------
object class
1 cells var cb-here
1 cells var cb-hi
1 cells var cb-lo \ Start of buffer storage area, dynamically allocated
1 cells - \ Therefore no space is needed for cb-lo
method cb-clear
method cb-concat
method cb-ch>cb
method cb-get
end-class ConcatBuffer
:noname ( size class -- buf )
dup >r @ + aligned dup allocate ( -- size' buf ior )
abort" Unable to allocate concatenation buffer"
r> over !
tuck + over cb-hi !
dup cb-clear
; ConcatBuffer defines new
:noname ( buf -- )
free abort" Delete concatenation buffer failed"
; ConcatBuffer defines delete
:noname ( buf -- )
dup cb-lo swap cb-here !
; ConcatBuffer defines cb-clear
: ?cb ( u buf -- ad )
dup >r cb-here @ 2dup + r@ cb-hi @ >= ( -- u ad f )
abort" Concatenation buffer overflow"
swap r> cb-here +!
;
:noname ( caddr u buf -- )
over swap ?cb swap cmove
; ConcatBuffer defines cb-concat
:noname ( char buf -- )
1 chars swap ?cb c!
; ConcatBuffer defines cb-ch>cb
:noname ( buf -- caddr u )
dup cb-lo swap cb-here @ over -
; ConcatBuffer defines cb-get
1024 constant cb-size
cb-size ConcatBuffer new constant cb
: clear-concat ( -- ) cb cb-clear ;
: concat ( caddr u -- ) cb cb-concat ;
: concat-char ( char -- ) cb cb-ch>cb ;
: get-concat ( -- caddr u ) cb cb-get ;
\ ------------------------------------------------------------------------------
2variable sb-source
\ Test if sym has its bit number set in the bit vector
\ testsym? revised for operation with >32 bit Forths
: testsym? ( set-index -- f ) first-set (testsym?) ;
\ ---[ Extra mode bits ]--------------------------------------------------------
\ Using the modifiers word defined for the regex compiler
\ Bit
\ 10 9 8
\ ~~~~~~
\ x x 0 {-c} don't clear concatenation buffer
\ x x 1 {c} clear concatenation buffer
\ x 0 x {-g} don't get resulting string at end of pattern
\ x 1 x {g} get resulting string at end of pattern
base @ decimal
256 constant c-flag
512 constant g-flag
base !
c-flag g-flag or default-sbmods ! \ c and g set by default
: save-cflag ( f -- ) c-flag save-mod ;
: save-gflag ( f -- ) g-flag save-mod ;
: get-cflag ( -- f ) c-flag get-mod ;
: get-gflag ( -- f ) g-flag get-mod ;
\ ------------------------------------------------------------------------------
: char-cat ( ch -- )
get-case clear-lu-flag map-case2 + c@
case 2 of >lower endof
3 of >upper endof
endcase
concat-char
;
: str-concat ( caddr u -- )
over + swap ?do i c@ char-cat loop
;
\ In subex-cat, when the #subex value is stored with each compiled regex, which
\ it needs to be, the test should compare the reference number with the #subex
\ value of the regex used in the last execution of match.
: subex-cat ( -- )
sym>num ( -- u )
dup 0 subex-limit 1+ within 0=
if ." String builder back reference out of range" report-error then
subex-match @ 0=
if ." No subexpression available" report-error then
get-subex str-concat ( -- )
;
: name-cat ( -- )
get-refname execute ( -- caddr u )
str-concat ( -- )
;
: get-head ( -- )
subject 2@ drop get-match ( -- caddr caddr2 u2 f )
if drop over - else 2drop subject @ then ( -- caddr u )
concat
;
: get-matching ( -- ) get-match drop concat ;
: get-tail ( -- )
get-match
if
+ subject 2@ + ( -- caddr1 caddr2 )
over - concat ( -- )
else
2drop
then
;
\ In substitute
\ (caddr1 u1) is the result from match
\ (caddr2 u2) is the replacement string for (caddr1 u1)
\ Typical usage is:
\ get-input-string ( -- caddr u )
\ rgx \ where rgx is a compiled regular expression
\ match ?dup ( -- caddr1 u1 )
\ if
\ build-replacement-text ( -- caddr1 u1 caddr2 u2 )
\ substitute ( -- caddr3 u3 )
\ save-result ( -- )
\ then
: substitute ( caddr1 u1 caddr2 u2 -- caddr u4 )
clear-concat get-head concat concat get-concat
;
: run-code ( -- ) [char] } parse-to-char evaluate ;
: ?clear-concat ( caddr u -- )
drop s" (?-c)" tuck compare get-cflag and
if clear-concat then
;
: do-cflag ( f -- )
dup if clear-concat then
save-cflag
;
\ condjump used in conditional commands to jump to the else part or the end
: ?sym| ( -- f )
symname 2@ s" |" compare 0=
;
: skip-to-end ( -- ) ['] <skip_strcode> skipper ;
: condjump ( f -- )
if
?sym| \ Empty then part
if nextsym skip-to-end then
else
['] <skip_strcode> skipper
then
;
: jump-to-end ( -- )
?sym| if nextsym then
skip-to-end
;
: ?reference ( -- f )
[char] ) parse-to-char ( -- caddr u )
0 0 2over >decnumber ( -- caddr u ud caddr1 u1 )
if \ A name not a number
drop 2drop find-xt ( -- caddr 0 | xt <>0 )
?name-found ( -- xt )
execute ( -- caddr2 u2 )
else \ A number, therefore a subexpression reference
2drop get-subex ( -- caddr u caddr3 u3 )
2swap 2drop ( -- caddr3 u3 )
then
or 0<> ( -- f )
;
\ ---[ Process a format specification ]-----------------------------------------
\ Don't change the value of the first 3 flags, they are used for sign
\ character calculations
base @ decimal
1 constant plus-flag 2 constant space-flag 4 constant neg-flag
8 constant left-flag 16 constant zero-flag 32 constant hash-flag
64 constant long-flag 128 constant prec-flag 256 constant ucase-flag
base !
\ : clear-flags ( flags -- ) invert flags @ and flags ! ;
\ Format base class
object class
1 cells var fm-flags
1 cells var fm-width
1 cells var fm-prec
method >string
method get-prefix
end-class Format
: set-flag ( flag fmt -- ) dup >r fm-flags @ or r> fm-flags ! ;
: ?flags ( flag fmt -- f ) fm-flags @ and ;
:noname ( width prec flags class -- fmt )
[ object :: new ] >r
r@ fm-flags ! r@ fm-prec ! r@ fm-width !
r>
; Format defines new
:noname ( fmt -- fmt 0 ) 0 ; Format defines get-prefix
\ --- Decimal format class, handles 'd' and 'ld'
Format class end-class d-Format
: ud>str ( ud fmt -- caddr u )
>r 2dup or r> fm-prec @ or 0<> >r \ if ud = prec = 0 return (caddr 0)
<# #s #> r> and
;
:noname ( n | d fmt -- caddr u )
>r dup 0< neg-flag and r@ set-flag
long-flag r@ ?flags if dabs else abs 0 then ( -- d1 )
r> ud>str ( -- caddr u )
; d-format defines >string
create sign-char
0 c, char + c, bl c, char + c, char - dup dup dup c, c, c, c, align
:noname ( fmt -- caddr u )
fm-flags @
[ plus-flag space-flag or neg-flag or ] literal and
dup sign-char + swap 0= 1+
; d-Format defines get-prefix
\ --- Unsigned decimal format class, handles 'u' and 'lu'
Format class end-class u-Format
:noname ( u | ud fmt -- caddr1 u1 )
long-flag over ?flags 0= if 0 swap then
ud>str
; u-format defines >string
\ --- Character format class, handles 'c'
Format class end-class c-Format
:noname ( width prec flags class -- fmt )
[ Format :: new ] ( -- fmt )
1 over fm-prec !
; c-Format defines new
:noname ( ch fmt -- caddr u )
drop <# hold 0 0 #>
; c-format defines >string
\ --- String format class, handles 's'
Format class end-class s-Format
:noname ( caddr u fmt -- caddr u' )
>r prec-flag r@ ?flags if r@ fm-prec @ min then
dup r> fm-prec !
; s-format defines >string
\ --- Hexadecimal class, handles x and X
u-Format class end-class x-Format
:noname ( u | ud fmt -- caddr1 u1 )
hex
dup >r [ u-Format :: >string ] ( -- caddr u )
2dup ucase-flag r> ?flags
if str>upper else str>lower then
; x-Format defines >string
:noname ( fmt -- caddr u )
hash-flag over ?flags
if
ucase-flag swap ?flags
if s" 0X" else s" 0x" then ( -- caddr u )
else
0 ( -- fmt 0 )
then
; x-Format defines get-prefix
\ --- Hexadecimal class, handles x and X
x-Format class end-class o-Format
:noname ( u | ud fmt -- caddr1 u1 )
8 base !
[ u-Format :: >string ] ( -- caddr1 u1 )
; o-Format defines >string
:noname ( fmt -- caddr u )
hash-flag swap ?flags
if s" 0" else pad 0 then
; o-Format defines get-prefix
\ --- Percent class, handles %
c-Format class end-class %-Format
:noname ( fmt -- caddr 1 )
drop s" %"
; %-Format defines >string
: new-format ( width prec flags caddr -- fmt caddr' )
dup >r c@ dup [char] A [char] Z 1+ within
if swap ucase-flag or swap >lower then
case
[char] d of d-Format endof
[char] u of u-Format endof
[char] o of o-Format endof
[char] x of x-Format endof
[char] c of c-Format endof
[char] s of s-Format endof
[char] % of %-Format endof
." Invalid conversion operator" report-error
endcase
new r> char+ ( -- fmt caddr' )
;
\ Returns u2 = number of precision 0's needed
\ u3 = number of padding characters needed
: calc-#prec&pad ( u fmt -- u2 u3 ) \ u is converted string length
2>r r@ fm-width @ ( -- width )
2r@ fm-prec @ swap - 0 max tuck ( -- u2 width u2 )
r> get-prefix nip +
r> + - 0 max ( -- u2 u3 )
;
: pad-chars ( ch u -- )
0 ?do dup concat-char loop drop
;
: do-conversion ( [x] x2 fmt -- )
dup >r >string ( -- caddr u ) ( R: -- fmt )
dup r@ calc-#prec&pad ( -- caddr u u2 u3 )
zero-flag r@ ?flags 0= left-flag r@ ?flags 0= and
if bl over pad-chars then
r@ get-prefix concat
zero-flag r@ ?flags left-flag r@ ?flags 0= and
if [char] 0 over pad-chars then
[char] 0 rot pad-chars ( -- caddr u u3 )
>r concat r> ( -- u3 )
left-flag r> ?flags if bl over pad-chars then
drop
;
: add-flag ( flags ch -- flags' f ) \ f = false for non-flag character
case
[char] - of left-flag endof
[char] 0 of zero-flag endof
[char] + of plus-flag endof
[char] # of hash-flag endof
bl of space-flag endof
0 swap
endcase
dup >r or r>
;
: read-flags ( caddr1 -- caddr2 flags )
0
begin
over c@ add-flag ( -- caddr1 flags f )
while
>r char+ r>
repeat
;
: read-int ( [u0] caddr1 u1 -- caddr2 u2 u )
over c@ [char] * = \ Width or precision on the stack
if
1 /string
else
0 0 2swap >decnumber rot drop
then
rot
;
: read-precision ( caddr1 u1 -- caddr2 u2 u flag )
over c@ [char] . =
if 1 /string read-int prec-flag else 1 0 then
;
: read-long ( caddr -- caddr' flag )
dup c@ [char] l = if char+ long-flag else 0 then
;
: do-format ( [u1] [u2] -- ) \ u1 = prec & u2 = width may be on the stack
base @ >r decimal
get-pos ( -- caddr )
read-flags >r ( -- caddr1 ) ( R: -- flags )
reset-pos ( -- )
[char] ) parse-to-char ( -- caddr2 u2 )
read-int >r ( -- caddr3 u3 ) ( R: -- flags width )
read-precision 2>r drop ( -- caddr4 ) ( R: -- flags width prec flag )
read-long r> or ( -- caddr5 flag' ) ( R: -- flags width prec )
2r> 2swap r> or swap ( -- width prec flags' caddr5 )
new-format reset-pos ( -- fmt )
dup >r do-conversion ( -- ) ( R: -- fmt )
r> delete
r> base !
;
\ ------------------------------------------------------------------------------
: init-stringer ( caddr u -- )
sb-token regex-mode ! init-sbmodifiers
2dup sb-source 2! 2dup regex-source 2! set-regex \ ??? Need to rationalise report-error etc
;
: (stringer) ( caddr u -- caddr2 u2 )
2dup init-stringer ?clear-concat
nextsym parser-name execute
get-gflag if get-concat then
;
: stringer$ ( caddr u -- caddr2 u2 )
save-global-state >r
0 to source-flag (stringer)
r> restore-global-state
;
: parse-stringer ( char "ccc<char>" -- caddr u )
parse stringer$
;
: stringer ( -- caddr u )
save-global-state >r
source >in @ /string -1 to source-flag
(stringer)
source nip regex-str @ - >in !
r> restore-global-state
;
\ stringify reads a free format string pattern from the following input source,
\ strips out any white space (including CRs and LFs) and converts it into a
\ single string in the concatenation buffer. The user can then use CONSTANT$
\ to make it a string constant. The purpose is to provide a readable pattern
\ that can be used as a string in a colon definition. Stringify recognises
\ and acts on the free format commands (?x) and (?-x) but not free format
\ spans (?x: ...). The input pattern ends with (?end) or (?e).
\ An initial (?-c) will prevent the concatenation buffer being cleared so that
\ an existing string can be appended to. Without (?-c) it overwrites any user
\ data in the concatenation buffer. Stringify can also be used for making
\ regular expression strings but any white space inside character classes will
\ be stripped out (use \s instead).
: ?xc-command ( caddr u tok -- caddr u tok | caddr2 u2 tok2 )
dup (?-c)tok <> if clear-concat then
dup (?x)tok = over (?-c)tok = or
if drop 2drop get-token then ( -- caddr2 u2 tok2 )
;
\ In free-format mode white space and comments are filtered out by get-token
: (stringify) ( caddr u tok -- )
begin
case (?x)tok of 2drop -1 save-xflag endof
(?-x)tok of 2drop 0 save-xflag endof
(?end)tok of 2drop exit endof
end-of-line of 2drop exit endof
-rot concat
endcase
get-token
again
;
: stringify ( -- caddr u )
source >in @ /string -1 to source-flag
init-stringer -1 save-xflag
get-token ?xc-command
(stringify)
source nip regex-str @ - >in !
get-concat
;
: constant$ ( caddr u "spaces<name>" -- ) \ <name> execution ( -- caddr u )
here swap dup chars allot 2dup 2>r cmove
2r> 2constant
;
\ ---[ GlobalState methods ]----------------------------------------------------
\ Used for recursing regex and stringer
\ ??? Needs to be rationalised with the Matcher object
object class
2 cells var gl-regexstr
2 cells var gl-rgxsource
2 cells var gl-sbsource
1 cells var gl-mods
1 cells var gl-sourceflag
1 cells var gl-sym
2 cells var gl-symname
1 cells var gl-rgxmode
end-class GlobalState
:noname ( -- obj )
GlobalState [ GlobalState :: new ] >r ( R: -- obj )
regex-str 2@ r@ gl-regexstr 2!
regex-source 2@ r@ gl-rgxsource 2!
sb-source 2@ r@ gl-sbsource 2!
modifiers @ r@ gl-mods !
source-flag r@ gl-sourceflag !
sym @ r@ gl-sym !
symname 2@ r@ gl-symname 2!
regex-mode @ r@ gl-rgxmode !
r>
; is save-global-state
:noname ( obj -- )
dup gl-regexstr 2@ regex-str 2!
dup gl-rgxsource 2@ regex-source 2!
dup gl-sbsource 2@ sb-source 2!
dup gl-mods @ modifiers !
dup gl-sourceflag @ to source-flag
dup gl-sym @ sym !
dup gl-symname 2@ symname 2!
dup gl-rgxmode @ regex-mode !
[ GlobalState :: delete ]
; is restore-global-state
\ ------------------------------------------------------------------------------
[defined] [rgx-dev] [if] .( strbuild.fth loaded ) .s [then]
\ Parser generated by Grace
\ See http://www.qlikz.org/forth/grace/grace.html
16 to bytes/set
32 to bits/cell
: <metachar> 0 testsym? if 46 test-token if [char] \ nextsym else
[char] ( 37 test-token ?nextsym then else 1 testsym? if 2 testsym? if 38
test-token if [char] ) nextsym else [char] $ 40 test-token
?nextsym then else 41 test-token if [char] . nextsym else
[char] { 33 test-token ?nextsym then then else 3 testsym? if 34 test-token if
rbrace nextsym else [char] # 49 test-token ?nextsym then else 42
test-token if [char] | nextsym else [char] % 113 test-token
?nextsym then then then then concat-char ; : <forth> run-code 67 test-token
?nextsym 69 test-token ?nextsym ;
defer <items>
: <group> 61 test-token if get-modifiers >r
nextsym <items> 62 test-token ?nextsym r> modifiers ! else <forth>
then ; : <escapedchar> 4 testsym? if 91 test-token if ^bel
nextsym else ^esc 93 test-token ?nextsym then else 5 testsym? if 6 testsym? if
94 test-token if ^ff nextsym else ^lf 95 test-token ?nextsym then
else 96 test-token if ^cr nextsym else ^ht 97 test-token ?nextsym
then then else 7 testsym? if 98 test-token if ^vt nextsym else
^nul 102 test-token ?nextsym then else 8 testsym? if bl 48 test-token if
nextsym else 31 test-token ?nextsym then else get-controlchar 92
test-token ?nextsym then then then then concat-char ; : <reference> 108
test-token if subex-cat nextsym else 9 testsym? if 109
test-token if name-cat nextsym else get-head 110 test-token
?nextsym then else 111 test-token if get-matching nextsym else
get-tail 112 test-token ?nextsym then then then ;
defer <itemormod>
: <thenpart> begin <itemormod>
10 testsym? 0= until jump-to-end ; : <conditional> 66 test-token ?nextsym 67
test-token if <forth> else ?reference nextsym 62 test-token
?nextsym then condjump 10 testsym? if <thenpart> then 3 test-token if
nextsym <items> then 62 test-token ?nextsym ; : <control>
<conditional> ; : <formatspec> do-format 119 test-token if
nextsym else 118 test-token ?nextsym 62 test-token ?nextsym then ; : <command>
11 testsym? if get-switch 12 testsym? if do-cflag 114 test-token if
nextsym else 115 test-token ?nextsym then else save-gflag 116
test-token if nextsym else 117 test-token ?nextsym then then
else <formatspec> then ; : <item> 13 testsym? if 14 testsym? if
curr-char @ char-cat 0 test-token if nextsym else 2 test-token
?nextsym then else 15 testsym? if <metachar> else <group> then then else 16
testsym? if 17 testsym? if <escapedchar> else <reference> then else 66
test-token if <control> else <command> then then then ; : <modemod> 18 testsym?
if get-switch save-xflag 82 test-token if nextsym else 84
test-token ?nextsym then else get-modifiers >r get-switch save-xflag 83
test-token if nextsym else 85 test-token ?nextsym then <items>
r> modifiers ! 62 test-token ?nextsym then ; : <casefolder> 103 test-token if
save-llflag nextsym else 19 testsym? if 105 test-token if
save-luflag nextsym else save-uLflag 104 test-token ?nextsym
then else 106 test-token if save-uUflag nextsym else
clear-caseflags 107 test-token ?nextsym then then then ; : <modifier> 20
testsym? if <modemod> else <casefolder> then ; :noname 21 testsym? if <item>
else <modifier> then ; is <itemormod> :noname begin 10 testsym? while
<itemormod> repeat ; is <items> : <stringer> <items> 0 save-xflag 1 test-token
if nextsym else 86 test-token ?nextsym then ; : this-parser
<stringer> ;
' this-parser to parser-name
: ~ 0 0 parse-name >number 2drop drop 4 0 do dup c, 8 rshift loop drop ;
here to first-set base @
decimal 36 base ! ~ 0 ~ CO0 ~ 0 ~ 0 ~ 0 ~ N6 ~ 0 ~ 0 ~ 0 ~ 8W ~ 0 ~ 0 ~ 0 ~ 2T50
~ 0 ~ 0 ~ 0 ~ 0 ~ B3JRB4 ~ 0 ~ 0 ~ 0 ~ 1H9U1HC ~ 3 ~ 0 ~ 0 ~ 1H9U1HC ~ 0 ~ 0 ~ 0
~ 0 ~ 1W ~ ZIK0ZK ~ 1EKG ~ 0 ~ 0 ~ 0 ~ 0 ~ 0 ~ IYO ~ ZIK0ZP ~ 8VRM1Y ~ 1WVJKZW
~ 9ZLC7 ~ 0 ~ 0 ~ 0 ~ 2CA2O ~ 0 ~ 0 ~ 0 ~ GUTC ~ 5 ~ 8VQ7HI ~ 8 ~ 2T4W ~ 5 ~ 0
~ 0 ~ 0 ~ 0 ~ 378M ~ 0 ~ 2T4W ~ ZIK0ZK ~ 1EKG ~ 1WT7AWW ~ 2Q13 ~ ZIK0ZK ~ 1EKG
~ 1WT7AWW ~ 1Z ~ 0 ~ 0 ~ S3CW ~ 0 ~ 0 ~ 0 ~ 0 ~ LC ~ 0 ~ 0 ~ 2CA2O ~ 0 ~ ZIK0ZP
~ 8VRM1Y ~ 1WT7AX8 ~ 9ZI9Z base !
\ ------------------------------------------------------------------------------
rgx-user-base base !
cr cr .( Regex version 0.8 )
cr .( Copyright (C) char ) emit .( Gerry Jackson 2010) cr cr
\ ------------------------------------------------------------------------------