|
Keep in mind this was generated by a tool, so the table names would be different for a different input specification file. The trick with this routine is filling the state tables properly
The reason it's so nasty is fetching the next UTF32 codepoint in SQL is a pain in my backside
CREATE PROCEDURE [dbo].[Example_Match] @value NVARCHAR(MAX), @symbolId INT
AS
BEGIN
DECLARE @valueEnd INT = DATALENGTH(@value)/2+1
DECLARE @index INT = 1
DECLARE @ch BIGINT
DECLARE @ch1 NCHAR
DECLARE @ch2 NCHAR
DECLARE @tch BIGINT
DECLARE @state INT = 0
DECLARE @toState INT = -1
DECLARE @accept INT = -1
DECLARE @position BIGINT = 0
DECLARE @capture NVARCHAR(MAX)
DECLARE @blockEndId INT
DECLARE @result INT = 0
DECLARE @len INT = 0
DECLARE @done INT = 0
CREATE TABLE #Results (
[Position] BIGINT NOT NULL,
[Value] NVARCHAR(MAX) NOT NULL,
[Length] INT NOT NULL
)
IF @index >= @valueEnd
BEGIN
SET @ch = -1
END
ELSE
BEGIN
SET @ch1 = SUBSTRING(@value,@index,1)
SET @ch = UNICODE(@ch1)
SET @tch = @ch - 0xd800
IF @tch < 0 SET @tch = @tch + 2147483648
IF @tch < 2048
BEGIN
SET @ch = @ch * 1024
SET @index = @index + 1
IF @index >= @valueEnd RETURN -1
SET @ch2 = SUBSTRING(@value,@index,1);
SET @ch = @ch + UNICODE(@ch2) - 0x35fdc00
END
END
WHILE @ch <> -1
BEGIN
SET @capture = N''
SET @position = @index - 1
SET @done = 0
WHILE @done = 0
BEGIN
SET @done = 1
SET @toState = -1
SELECT @toState = [dbo].[ExampleStateTransition].[ToStateId] FROM [dbo].[ExampleState] INNER JOIN [dbo].[ExampleStateTransition] ON [dbo].[ExampleState].[StateId]=[dbo].[ExampleStateTransition].[StateId] AND [dbo].[ExampleState].[SymbolId]=[dbo].[ExampleStateTransition].[SymbolId] AND [dbo].[ExampleStateTransition].[BlockEndId]=-1 WHERE [dbo].[ExampleState].[SymbolId]=@symbolId AND [dbo].[ExampleState].[StateId]=@state AND [dbo].[ExampleState].[BlockEndId] = -1 AND @ch BETWEEN [dbo].[ExampleStateTransition].[Min] AND [dbo].[ExampleStateTransition].[Max]
IF @toState <> -1
BEGIN
SET @done = 0
SET @state = @toState;
SET @capture = @capture + @ch1
IF @tch < 2048 SET @capture = @capture + @ch2
SET @index = @index + 1
IF @index >= @valueEnd
BEGIN
SET @ch = -1
SET @done = 1
END
ELSE
BEGIN
SET @ch1 = SUBSTRING(@value,@index,1)
SET @ch = UNICODE(@ch1)
SET @tch = @ch - 0xd800
IF @tch < 0 SET @tch = @tch + 2147483648
IF @tch < 2048
BEGIN
SET @ch = @ch * 1024
SET @index = @index + 1
IF @index >= @valueEnd RETURN -1
SET @ch2 = SUBSTRING(@value,@index,1);
SET @ch = @ch + UNICODE(@ch2) - 0x35fdc00
END
END
END
END
SET @accept = -1
SELECT @accept = [dbo].[ExampleState].[SymbolId] FROM [dbo].[ExampleState] WHERE [dbo].[ExampleState].[SymbolId] = @symbolId AND [dbo].[ExampleState].[StateId] = @state AND [dbo].[ExampleState].[BlockEndId] = -1 AND [dbo].[ExampleState].[Accepts]=1
IF @accept <> -1
BEGIN
SELECT TOP 1 @blockEndId = [dbo].[ExampleState].[BlockEndId] FROM [dbo].[ExampleState] WHERE [dbo].[ExampleState].[SymbolId]=@symbolId AND [dbo].[ExampleState].[BlockEndId] <> -1
IF @blockEndId <> -1
BEGIN
SET @result = 0
SET @state = 0
WHILE @ch <> -1
BEGIN
SET @done = 0
WHILE @done = 0
BEGIN
SET @done = 1
SET @toState = -1
SELECT @toState = [dbo].[ExampleStateTransition].[ToStateId] FROM [dbo].[ExampleState] INNER JOIN [dbo].[ExampleStateTransition] ON [dbo].[ExampleState].[StateId]=[dbo].[ExampleStateTransition].[StateId] AND [dbo].[ExampleState].[SymbolId]=[dbo].[ExampleStateTransition].[SymbolId] AND [dbo].[ExampleStateTransition].[BlockEndId]=@blockEndId WHERE [dbo].[ExampleState].[SymbolId]=@symbolId AND [dbo].[ExampleState].[StateId]=@state AND [dbo].[ExampleState].[BlockEndId] = @blockEndId AND @ch BETWEEN [dbo].[ExampleStateTransition].[Min] AND [dbo].[ExampleStateTransition].[Max]
IF @toState <> -1
BEGIN
SET @done = 0
SET @state = @toState
SET @capture = @capture + @ch1
IF @tch < 2048 SET @capture = @capture + @ch2
SET @index = @index + 1
IF @index >= @valueEnd
BEGIN
SET @ch = -1
SET @done = 1
END
ELSE
BEGIN
SET @ch1 = SUBSTRING(@value,@index,1)
SET @ch = UNICODE(@ch1)
SET @tch = @ch - 0xd800
IF @tch < 0 SET @tch = @tch + 2147483648
IF @tch < 2048
BEGIN
SET @ch = @ch * 1024
SET @index = @index + 1
IF @index >= @valueEnd RETURN -1
SET @ch2 = SUBSTRING(@value,@index,1);
SET @ch = @ch + UNICODE(@ch2) - 0x35fdc00
END
END
END
END
SET @accept = -1
SELECT @accept = [dbo].[ExampleState].[SymbolId] FROM [dbo].[ExampleState] WHERE [dbo].[ExampleState].[SymbolId] = @symbolId AND [dbo].[ExampleState].[StateId] = @state AND [dbo].[ExampleState].[BlockEndId] = @blockEndId AND [dbo].[ExampleState].[Accepts]=1
IF @accept <> -1
BEGIN
INSERT INTO #Results SELECT @position AS [Position], @capture AS [Value], DATALENGTH(@capture)/2 as [Length]
SET @state = 0
BREAK
END
ELSE
BEGIN
SET @capture = @capture + @ch1
IF @tch < 2048 SET @capture = @capture + @ch2
SET @index = @index + 1
IF @index >= @valueEnd
BEGIN
SET @ch = -1
SET @done = 1
END
ELSE
BEGIN
SET @ch1 = SUBSTRING(@value,@index,1)
SET @ch = UNICODE(@ch1)
SET @tch = @ch - 0xd800
IF @tch < 0 SET @tch = @tch + 2147483648
IF @tch < 2048
BEGIN
SET @ch = @ch * 1024
SET @index = @index + 1
IF @index >= @valueEnd RETURN -1
SET @ch2 = SUBSTRING(@value,@index,1);
SET @ch = @ch + UNICODE(@ch2) - 0x35fdc00
END
END
END
SET @state = 0
END
SET @state = 0
CONTINUE
END
ELSE
BEGIN
SET @len = DATALENGTH(@capture)/2
IF(@len>0) INSERT INTO #Results SELECT @position AS [Position], @capture AS [Value], @len as [Length]
END
END
SET @index = @index + 1
IF @index >= @valueEnd
BEGIN
SET @ch = -1
SET @done = 1
END
ELSE
BEGIN
SET @ch1 = SUBSTRING(@value,@index,1)
SET @ch = UNICODE(@ch1)
SET @tch = @ch - 0xd800
IF @tch < 0 SET @tch = @tch + 2147483648
IF @tch < 2048
BEGIN
SET @ch = @ch * 1024
SET @index = @index + 1
IF @index >= @valueEnd RETURN -1
SET @ch2 = SUBSTRING(@value,@index,1);
SET @ch = @ch + UNICODE(@ch2) - 0x35fdc00
END
END
END
SELECT * FROM #Results
DROP TABLE #Results
END
Real programmers use butterflies
|
|
|
|
|
Better to post an article, so it doesn't get lost in the limbo of the lounge past pages...
M.D.V.
If something has a solution... Why do we have to worry about?. If it has no solution... For what reason do we have to worry about?
Help me to understand what I'm saying, and I'll explain it better to you
Rating helpful answers is nice, but saying thanks can be even nicer.
|
|
|
|
|
I will be. The code is half done. I just got it tokenizing using tables in SQL! I'll be putting in non-table versions (compiled into stored procs) and then I'll update my Reggie article with it.
Gosh, when this is done, it won't be too hard to build parsers into SQL. Talk about normalizing content - give it a grammar and it can take your text fields and parse them into trees. That's actually pretty useful when you need to submit complicated content. You could easily make the DB normalize JSON values with such a beast. That's the next thing after Reggie: Norm - the data normalizing parser for C#, databases and beyond
Real programmers use butterflies
|
|
|
|
|
You should permalink this thread and ask @Sean-Ewington or @Chris-Maunder to bring some conversations to the message board of the article once published. There are interesting comments in them.
M.D.V.
If something has a solution... Why do we have to worry about?. If it has no solution... For what reason do we have to worry about?
Help me to understand what I'm saying, and I'll explain it better to you
Rating helpful answers is nice, but saying thanks can be even nicer.
|
|
|
|
|
Argh this code is a bear. I had the C# stuff working right, and then I tuned the database code to match it. Then I decided to add AbsolutePosition and AbsoluteLength properties so you could ALSO get the position and length in native characters instead of just UTF32 codepoints (it matters)
Then as I added that code, my positioning code started to break. I added a couple of minor hacks that hopefully evened it out, and then my error handling code isn't quite working right in any of them anymore.
For a run of errors like "...." I should be reporting 1 error if "...." instead of 4 errors of "." each
I'm not sure where things went south, as the C# version worked at one point.
Real programmers use butterflies
|
|
|
|
|
We can certainly move this thread to the article but I think the article should start fresh
cheers
Chris Maunder
|
|
|
|
|
Sorry. I get carried away with this stuff sometimes. My hubby hates it.
Real programmers use butterflies
|
|
|
|
|
Never, ever apologise for getting carried away with code!
It's awesome. Truly awesome.
cheers
Chris Maunder
|
|
|
|
|
I'm dying to make this update to Reggie. It's worth at least two articles - one for the SQL targeting alone.
It's just this bloody error handling, and then backporting any changes I make to this C# code to the relevant templates used to generate it (both for it and for SQL, and there are two implementations for each target - one for tables and one for compiled - meaning 2x2 = 4 different places i need to alter the code templates)
Even with my tools to ease maintenance this project is getting a little too big for me.
Just wait til i make your RDBMS normalize structured text like JSON or XML or submitted to stored procedures. Parsing's coming text, once I have a good tokenizer. The parser's called Norm, because it's my data "normalizer"
Eventually I intend to target JS, C++, python, PHP and maybe Java or something, but it depends if I can get any help.
Reggie and Norm will make triple tier validation for all kinds of content possible, and then also so much more than that.
Real programmers use butterflies
modified 31-Oct-21 13:19pm.
|
|
|
|
|
Python, eh?
Keep me posted.
cheers
Chris Maunder
|
|
|
|
|
Oh you know I will. I don't even like python but a lot of people do so I figure it's probably worth targeting, so it's worth teaching myself a little more of it - right now I can read it but not write it.
I think in the end what I want is something you can use to generate validation code for any kind of middleware platform, as well as front ends and back ends.
Real programmers use butterflies
|
|
|
|
|
So, um, probably a dumb question. Why not use SQL's built in regex capability?
SELECT * FROM #Sample WHERE Field LIKE '%[^a-z0-9 .]%'
|
|
|
|
|
It's not full regex, and works very badly, plus its practical unicode support (such as having character classes for letters and numbers) is dodgy if it even has it at all.
It's not really regex. LIKE is simple pattern matching closer to glorified dos filename wildcards than anything regex-like - though I haven't checked if they've improved on it since say SQL2000
Also with this tool, you can have a single spec file that has the same regex's for your C# and SQL code (also potentially other targets like JS)
Real programmers use butterflies
|
|
|
|
|
|
Can it return a table? Can it return XML?
I have a few Regex CLR functions, from a simple IsMatch to one which is table-valued and returns all the groups and such.
|
|
|
|
|
The procedure returns resultsets based on matches. You can insert those into a table With INSERT INTO ... EXEC / You can do tokenization too, which returns tokens, that would give you something like your "groups" where it can distinguish between an int field and a literal string for example. IsXXXX indicates that the entire target field matches the expression. MatchXXXX finds all occurrences. Tokenize tokenizes (returning all tokens) - you typically use that for parsing.
Since the data is flat, there's no sense in returning XML. With a big *however*
When I build out Norm, my latest parser generator that can also target SQL, it will return hierarchical resultsets in the format consumable by SQLXML (constructed with OUTER JOIN s, special column names, and a parent id to impose a hierarchy so you can get full parse tables as XML. Parse JSON from your database if you like. This will do it, and without using any particularly fancy features. It's "flavored" for MSSQL right now but the templates can be extended easily to support MySQL, Postgre and Oracle since I've been careful to avoid most of the extended database features.
Real programmers use butterflies
|
|
|
|
|
Here's an example:
SELECT * FROM Library.dbo.RegEx ( 'foo=bar fizz=buzz' , '(?''Name''\w+)=(?''Value''\w+)' )
Index Match Groups
0 foo=bar <Groups><Group Name="0" Success="True" Offset="0" Length="7">foo=bar</Group><Group Name="Name" Success="True" Offset="0" Length="3">foo</Group><Group Name="Value" Success="True" Offset="4" Length="3">bar</Group></Groups>
1 fizz=buzz <Groups><Group Name="0" Success="True" Offset="8" Length="9">fizz=buzz</Group><Group Name="Name" Success="True" Offset="8" Length="4">fizz</Group><Group Name="Value" Success="True" Offset="13" Length="4">buzz</Group></Groups>
modified 30-Oct-21 21:42pm.
|
|
|
|
|
Yeah, that's poor man's tokenizing. To be honest, .NET's regex engine is crap at it, just because they didn't do the minor work necessary to implement the feature (it's a slight "hack" or rather "twist" on a a|b|c such that each expression a,b, and c has a symbol id associated with it, and the engine will tell you which it matched. It goes through the text beginning to end, reporting all matches like that, one row in the table for each match..
Your input spec might look something like this (.rl format)
VerbatimStringLiteral= '@"([^"]|"")*"'
StringLiteral='"([^"]|\\.)*"'
CharacterLiteral= '[\']([^\']|\\.)([\'])'
IntegerLiteral= '(0x[0-9A-Fa-f]{1,16}|([0-9]+))([Uu][Ll]?|[Ll][Uu]?)?'
FloatLiteral= '(([0-9]+)(\.[0-9]+)?([Ee][+-]?[0-9]+)?[DdMmFf]?)|((\.[0-9]+)([Ee][+-]?[0-9]+)?[DdMmFf]?)'
// the following takes a long time to generate
//Keyword = 'abstract|as|base|bool|break|byte|case|catch|char|checked|class|const|continue|decimal|default|delegate|do|double|else|enum|event|explicit|extern|false|finally|fixed|float|for|foreach|goto|if|implicit|in|int|interface|internal|is|lock|long|namespace|new|null|object|operator|out|override|params|private|protected|public|readonly|ref|return|sbyte|sealed|short|sizeof|stackalloc|static|string|struct|switch|this|throw|true|try|typeof|uint|ulong|unchecked|unsafe|ushort|using|virtual|void|volatile|while'
Whitespace<hidden>='[\t\r\n\v\f ]+'
Identifier='[_[:IsLetter:]][_[:IsLetterOrDigit:]]*'
CommentBlock<id=40,blockEnd="*/">="/*"
//Bar="bar"
Forgive the word wrapping but it's a line based grammar.
So if you tokenize something something by calling Tokenize you get back a row for each match and what Symbol it was plus where it was in the document and its actual value (like CommentBlock at position 3, value "/* bar */") and you get many of those for a potential string.
Real programmers use butterflies
|
|
|
|
|
I am working on a Kotlin app that needs to load a large file at startup. Because it can take up to 20 seconds it must be done in a separate thread so it won't block the UI thread. When it is loaded I need to send a signal to the main UI thread, so that the main thread will know the file is ready for further processing.
Being a Kotlin greenhorn, I battled for hours to send a signal from the background thread to the main thread, with no luck. Then I remembered reading something about a built-in Kotlin method :
runOnUiThread() . "Surely it cannot be that simple", I thought. But in desperation I tried adding this statement to the end of the background thread:
runOnUiThread { someMethodOnUiThread() } And voila! it worked. Apparently this little gem inserts the requested method into the execution queue of the UI thread.
Sometimes we look for complicated solutions when a perfect simple solution is right under our noses.
By the way: Please don't tell me I should have used a Kotlin coroutine. I tried that, but when you use a file reading class like ObjectInputStream the coroutine starts blocking the UI thread.
Get me coffee and no one gets hurt!
modified 30-Oct-21 15:09pm.
|
|
|
|
|
I thought that's what postMessage does - posts a message onto the UI thread.
Window.postMessage() - Web APIs | MDN[^]
But then again, I don't write apps in Kotlin so I needed something more native, using the Worker class Using Web Workers - Web APIs | MDN[^] and various machinations to deal with the fact that I didn't want external files for the source for the worker.
|
|
|
|
|
Thanks. I may just try that for fun!
Get me coffee and no one gets hurt!
|
|
|
|
|
Quote: coroutine starts blocking the UI thread I believe that you can set the dispatcher context of the coroutine thread so that it runs on a thread other than the UI thread.
Dispatchers in Kotlin Coroutines - GeeksforGeeks[^]
That said if your solution works then that's fine as far as I am concerned(I am from the school of software needs to work rather than it needs to be beautifully architected but not work) - Kotlin still seems to have a few quirks with coroutines.
“That which can be asserted without evidence, can be dismissed without evidence.”
― Christopher Hitchens
|
|
|
|
|
My experience with coroutines is that they run in non-blocking mode, until you start doing disk io. Then they block the UI thread. I have also seen a number of posts confirming this behavior. I think it was on Stackoverflow.
Get me coffee and no one gets hurt!
|
|
|
|
|
go with what works.
I would have thought that disk IO would occupy the IO thread but as you hint coroutines can be a bit weird.
“That which can be asserted without evidence, can be dismissed without evidence.”
― Christopher Hitchens
|
|
|
|
|
*sigh* - (half serious here) what even is the point of UTF16? You still have the possibility of encountering surrogate characters, which means if you want to support unicode streams you have to handle that possibility.
MSSQL supports UTF16, but not UTF32
Consequently, here's me fetching the next character of an NVARCHAR(x) or NTEXT stream, with UTF32 support. Forgive the grotty code, as there is no unsigned data types and no bit shifts, etc
DECLARE @valueEnd INT = DATALENGTH(@value)/2+1
DECLARE @index INT = 1
DECLARE @ch BIGINT
DECLARE @tch BIGINT
...
SET @ch = UNICODE(SUBSTRING(@value,@index,1))
SET @tch = @ch - 0xd800
IF @tch < 0 SET @tch = @tch + 2147483648
IF @tch < 2048
BEGIN
SET @ch = @ch * 1024
SET @index = @index + 1
IF @index >= @valueEnd RETURN -1
SET @ch = @ch + UNICODE(SUBSTRING(@value,@index,1)) - 0x35fdc00
END
This is hateful and slow. I haven't even tested it yet. I know it doesn't gracefully handle any and all invalid unicode streams but to make it do that is even worse. To heck with this.
Why UTF16 with no functions to convert a surrogate pair to UTF32? it's ridic.
Someone asked me the other day why I don't like Microsoft SQL Server. Here's reason #1359
Bad MSSQL! BAD!
Real programmers use butterflies
|
|
|
|
|