ITokenizer
ITokenizer dismantles characters into token objects.
/// <summary>Non-generic marker interface for tokenizers; <see cref="ITokenizer{T}"/> derives from it.</summary>
public interface ITokenizerBase { }
/// <summary>Dismantles tokens from char memory.</summary>
/// <typeparam name="T">Type of token this tokenizer produces.</typeparam>
public interface ITokenizer<T> : ITokenizerBase where T : IToken
{
/// <summary>Peek to test whether <paramref name="text"/> starts with <typeparamref name="T"/>.</summary>
/// <param name="text">Characters to test.</param>
/// <returns><see langword="true"/> if <paramref name="text"/> starts with a <typeparamref name="T"/> token.</returns>
bool Peek(ReadOnlyMemory<char> text);
/// <summary>Try take a <typeparamref name="T"/> token from <paramref name="text"/>.</summary>
/// <param name="text">Characters to take the token from.</param>
/// <param name="token">The taken token when the method returns <see langword="true"/>.</param>
/// <returns><see langword="true"/> if a token was taken.</returns>
bool TryTake(ReadOnlyMemory<char> text, out T token);
}
.TryTake(memory, out token) tries to take the next token. IntegerTokenizer tokenizes integer characters. .SliceAfter(arg) is an extension method in Avalanche.Utilities.dll that returns the Memory region after 'arg'.
// Text to tokenize
String @string = "1, x, 3, 4, 5, x, 7";
ReadOnlyMemory<char> text = @string.AsMemory();
// Get tokenizer from singleton
var valueTokenizer = IntegerTokenizer.Instance;
// Try take an integer token from the start of the text
if (valueTokenizer.TryTake(text, out DecimalToken token0))
{
// Print token
WriteLine(token0); // "[0:1] DecimalToken "1""
// Slice: continue with the text that follows the taken token
text = text.SliceAfter(token0.Memory);
}
ConstantTokenizer<T>(string) tokenizes only a specific string as T.
// Create comma tokenizer
var commaTokenizer = new ConstantTokenizer<SeparatorToken>(",");
// Try take ','
if (commaTokenizer.TryTake(text, out SeparatorToken token2))
{
// Print token
WriteLine(token2); // "[1:2] SeparatorToken ",""
// Slice: continue with the text that follows the taken token
text = text.SliceAfter(token2.Memory);
}
WhitespaceTokenizer.Any tokenizes all white-space characters, and WhitespaceTokenizer.AllButNewLine tokenizes all white-space except new lines, into WhitespaceToken.
// Try take the white-space that follows the comma
if (WhitespaceTokenizer.Any.TryTake(text, out WhitespaceToken token1))
{
// Print token
WriteLine(token1); // "[2:3] WhitespaceToken " ""
// Slice: continue with the text that follows the taken token
text = text.SliceAfter(token1.Memory);
}
UntilTokenizer(endCondition) tokenizes characters until end of stream or until end condition is found. End condition is determined by another tokenizer.
// Create malformed tokenizer: takes characters until ',' or white-space is found (or the text ends)
var malformedTokenizer = new UntilTokenizer<MalformedToken>(new AnyTokenizer(new ConstantTokenizer(","), WhitespaceTokenizer.Any));
// Try take 'x'
if (malformedTokenizer.TryTake(text, out MalformedToken token3))
{
// Print token
WriteLine(token3); // "[3:4] MalformedToken "x""
// Slice: continue with the text that follows the taken token
text = text.SliceAfter(token3.Memory);
}
.Take<T>() returns token or null.
// Take ',' (Take returns null when no token is available; '!' asserts that one is expected here)
IToken token4 = commaTokenizer.Take<IToken>(text)!;
text = text.SliceAfter(token4.Memory);
// Take ' '
IToken token5 = WhitespaceTokenizer.Any.Take<IToken>(text)!;
text = text.SliceAfter(token5.Memory);
SequenceTokenizer( (tokenizer, required, yieldChildren), ... ) tokenizes a sequence of sub-tokenizers.
// Create " , x " slot tokenizer.
// Each tuple is (tokenizer, required, yieldChildren): only the value/malformed part is required.
var slotTokenizer = new SequenceTokenizer<ValueToken>(
(WhitespaceTokenizer.Any, false, false),
(commaTokenizer, false, false),
(WhitespaceTokenizer.Any, false, false),
(new AnyTokenizer(valueTokenizer, malformedTokenizer), true, true),
(WhitespaceTokenizer.Any, false, false)
);
// Try take '3'
if (slotTokenizer.TryTake(text, out IToken token6))
{
// Print token
WriteLine(token6); // "[6:7] ValueToken "3""
// Slice: continue with the text that follows the taken token
text = text.SliceAfter(token6.Memory);
}
WhileTokenizer(subTokenizer, yieldChildren) repeats while sub-tokenizer provides tokens.
// Put together a while tokenizer that takes all integer/malformed parts and repeats while content lasts
// (second argument is yieldChildren)
var whileTokenizer = new WhileTokenizer(slotTokenizer, false);
// Try take all and print the resulting token tree
if (whileTokenizer.TryTake("1, x, 3, 4".AsMemory(), out IToken tokenAll)) WriteLine(tokenAll.PrintTree());
Tokenizes the string "1, x, 3, 4" into the following tree.
CompositeToken: "1, x, 3, 4" ├── ValueToken: "1" │ └── DecimalToken: "1" ├── ValueToken: ", x" │ ├── SeparatorToken: "," │ ├── WhitespaceToken: " " │ └── MalformedToken: "x" ├── ValueToken: ", 3" │ ├── SeparatorToken: "," │ ├── WhitespaceToken: " " │ └── DecimalToken: "3" └── ValueToken: ", 4" ├── SeparatorToken: "," ├── WhitespaceToken: " " └── DecimalToken: "4"
.PrintTree() prints tokenizer composition as a tree.
// Print the tokenizer composition (not the tokens) as a tree
WriteLine(whileTokenizer.PrintTree());
WhileTokenizer { YieldChildren = False } └── ElementTokenizer = SequenceTokenizer├── Tokenizers[0] = WhitespaceTokenizer { IncludeNewLine = False } ├── Tokenizers[1] = ConstantTokenizer { Text = "," } ├── Tokenizers[2] = WhitespaceTokenizer { IncludeNewLine = False } ├── Tokenizers[3] = AnyTokenizer │ ├── Tokenizers[0] = IntegerTokenizer { NumberFormat = System.Globalization.NumberFormatInfo } │ └── Tokenizers[1] = UntilTokenizer { EndsWithEndCondition = False } │ └── EndConditionTokenizer = AnyTokenizer │ ├── Tokenizers[0] = ConstantTokenizer { Text = "," } │ └── Tokenizers[1] = WhitespaceTokenizer { IncludeNewLine = False } └── Tokenizers[4] = WhitespaceTokenizer { IncludeNewLine = False }
.VisitTree() visits each tokenizer.
// Visit each tokenizer of the composition and print every visited entry
foreach (var line in whileTokenizer.VisitTree())
WriteLine(line);
.TakeAll<T> takes the whole string as one token, or throws InvalidOperationException if it could not tokenize all content.
// Take the whole string as one token; throws InvalidOperationException if not all content could be tokenized
IToken compositeToken = whileTokenizer.TakeAll<IToken>("1, x, 3, 4, 5, x, 7");
// Print the token tree in the long format
WriteLine(compositeToken.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
[0:19] CompositeToken: "1, x, 3, 4, 5, x, 7" ├── [0:1] ValueToken: "1" │ └── [0:1] DecimalToken: "1" ├── [1:4] ValueToken: ", x" │ ├── [1:2] SeparatorToken: "," │ ├── [2:3] WhitespaceToken: " " │ └── [3:4] MalformedToken: "x" ├── [4:7] ValueToken: ", 3" │ ├── [4:5] SeparatorToken: "," │ ├── [5:6] WhitespaceToken: " " │ └── [6:7] DecimalToken: "3" ├── [7:10] ValueToken: ", 4" │ ├── [7:8] SeparatorToken: "," │ ├── [8:9] WhitespaceToken: " " │ └── [9:10] DecimalToken: "4" ├── [10:13] ValueToken: ", 5" │ ├── [10:11] SeparatorToken: "," │ ├── [11:12] WhitespaceToken: " " │ └── [12:13] DecimalToken: "5" ├── [13:16] ValueToken: ", x" │ ├── [13:14] SeparatorToken: "," │ ├── [14:15] WhitespaceToken: " " │ └── [15:16] MalformedToken: "x" └── [16:19] ValueToken: ", 7" ├── [16:17] SeparatorToken: "," ├── [17:18] WhitespaceToken: " " └── [18:19] DecimalToken: "7"
Implementation
TokenizerBase<T> is the base class for most tokenizers. It implements ITokenizer<IToken>, ITokenizer<T> and Peek.
/// <summary>Tokenizes letters and digits as <see cref="IdentifierToken"/>.</summary>
public class IdentifierTokenizer : TokenizerBase<IdentifierToken>
{
    /// <summary>Singleton</summary>
    static readonly IdentifierTokenizer instance = new IdentifierTokenizer();
    /// <summary>Singleton</summary>
    public static IdentifierTokenizer Instance => instance;

    /// <summary>Try take the leading run of letters and digits of <paramref name="text"/> as an <see cref="IdentifierToken"/>.</summary>
    /// <param name="text">Characters to tokenize.</param>
    /// <param name="token">Token over the leading letter/digit run on success; <c>default</c> on failure.</param>
    /// <returns><see langword="true"/> if at least one character was accepted.</returns>
    public override bool TryTake(ReadOnlyMemory<char> text, out IdentifierToken token)
    {
        // Get span
        ReadOnlySpan<char> span = text.Span;
        // Count leading characters that are letters or digits
        int length = 0;
        while (length < span.Length && char.IsLetterOrDigit(span[length])) length++;
        // No chars were accepted
        if (length == 0) { token = default!; return false; }
        // Return result: token covers only the accepted prefix
        token = new IdentifierToken { Memory = text.Slice(0, length) };
        return true;
    }
}
// Text to tokenize
ReadOnlyMemory<char> text = "ID001, ID002, ID003".AsMemory();
// Get token
if (IdentifierTokenizer.Instance.TryTake(text, out IdentifierToken identifierToken))
{
// Slice text: continue with the text that follows the taken token
text = text.SliceAfter(identifierToken.Memory);
// Print token
WriteLine(identifierToken); // '[0:5] IdentifierToken "ID001"'
}
Full Example
Full example
using Avalanche.Tokenizer;
using Avalanche.Utilities;
using static System.Console;
public class tokenizer
{
/// <summary>Runs the tokenizer examples and writes the results to the console.</summary>
public static void Run()
{
{
// <01>
// Text to tokenize
String @string = "1, x, 3, 4, 5, x, 7";
ReadOnlyMemory<char> text = @string.AsMemory();
// Get tokenizer from singleton
var valueTokenizer = IntegerTokenizer.Instance;
// Try take token
if (valueTokenizer.TryTake(text, out DecimalToken token0))
{
// Print token
WriteLine(token0); // "[0:1] DecimalToken "1""
// Slice
text = text.SliceAfter(token0.Memory);
}
// </01>
// <02>
// Create comma tokenizer
var commaTokenizer = new ConstantTokenizer<SeparatorToken>(",");
// Try take ','
if (commaTokenizer.TryTake(text, out SeparatorToken token2))
{
// Print token
WriteLine(token2); // "[1:2] SeparatorToken ",""
// Slice
text = text.SliceAfter(token2.Memory);
}
// </02>
// <03>
// Try take white-space
if (WhitespaceTokenizer.Any.TryTake(text, out WhitespaceToken token1))
{
// Print token
WriteLine(token1); // "[2:3] WhitespaceToken " ""
// Slice
text = text.SliceAfter(token1.Memory);
}
// </03>
// <04>
// Create malformed tokenizer
var malformedTokenizer = new UntilTokenizer<MalformedToken>(new AnyTokenizer(new ConstantTokenizer(","), WhitespaceTokenizer.Any));
// Try take 'x'
if (malformedTokenizer.TryTake(text, out MalformedToken token3))
{
// Print token
WriteLine(token3); // "[3:4] MalformedToken "x""
// Slice
text = text.SliceAfter(token3.Memory);
}
// </04>
// <05>
// Take ','
IToken token4 = commaTokenizer.Take<IToken>(text)!;
text = text.SliceAfter(token4.Memory);
// Take ' '
IToken token5 = WhitespaceTokenizer.Any.Take<IToken>(text)!;
text = text.SliceAfter(token5.Memory);
// </05>
// <06>
// Create " , x " slot tokenizer
var slotTokenizer = new SequenceTokenizer<ValueToken>(
(WhitespaceTokenizer.Any, false, false),
(commaTokenizer, false, false),
(WhitespaceTokenizer.Any, false, false),
(new AnyTokenizer(valueTokenizer, malformedTokenizer), true, true),
(WhitespaceTokenizer.Any, false, false)
);
// Try take '3'
if (slotTokenizer.TryTake(text, out IToken token6))
{
// Print token
WriteLine(token6); // "[6:7] ValueToken "3""
// Slice
text = text.SliceAfter(token6.Memory);
}
// </06>
// <07>
// Put together a while tokenizer that takes all integer/malformed parts and repeats while content lasts
var whileTokenizer = new WhileTokenizer(slotTokenizer, false);
// Try take all
if (whileTokenizer.TryTake("1, x, 3, 4".AsMemory(), out IToken tokenAll)) WriteLine(tokenAll.PrintTree());
// </07>
// <08>
// Print tokenizer as tree
WriteLine(whileTokenizer.PrintTree());
// </08>
// <09>
foreach (var line in whileTokenizer.VisitTree())
WriteLine(line);
// </09>
// <10>
IToken compositeToken = whileTokenizer.TakeAll<IToken>("1, x, 3, 4, 5, x, 7");
WriteLine(compositeToken.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
// </10>
}
{
// <98>
// Text to tokenize
ReadOnlyMemory<char> text = "ID001, ID002, ID003".AsMemory();
// Get token
if (IdentifierTokenizer.Instance.TryTake(text, out IdentifierToken identifierToken))
{
// Slice text
text = text.SliceAfter(identifierToken.Memory);
// Print token
WriteLine(identifierToken); // '[0:5] IdentifierToken "ID001"'
}
// </98>
}
}
// <99>
/// <summary>Tokenizes letters and digits as <see cref="IdentifierToken"/>.</summary>
public class IdentifierTokenizer : TokenizerBase<IdentifierToken>
{
    /// <summary>Singleton</summary>
    static readonly IdentifierTokenizer instance = new IdentifierTokenizer();
    /// <summary>Singleton</summary>
    public static IdentifierTokenizer Instance => instance;

    /// <summary>Try take the leading run of letters and digits of <paramref name="text"/> as an <see cref="IdentifierToken"/>.</summary>
    /// <param name="text">Characters to tokenize.</param>
    /// <param name="token">Token over the leading letter/digit run on success; <c>default</c> on failure.</param>
    /// <returns><see langword="true"/> if at least one character was accepted.</returns>
    public override bool TryTake(ReadOnlyMemory<char> text, out IdentifierToken token)
    {
        // Get span
        ReadOnlySpan<char> span = text.Span;
        // Count leading characters that are letters or digits
        int length = 0;
        while (length < span.Length && char.IsLetterOrDigit(span[length])) length++;
        // No chars were accepted
        if (length == 0) { token = default!; return false; }
        // Return result: token covers only the accepted prefix
        token = new IdentifierToken { Memory = text.Slice(0, length) };
        return true;
    }
}
// </99>
}