Tokenizers
AllTokenizer takes all remaining characters.
var tokenizer = AllTokenizer<TextToken>.Instance;
TextToken token = tokenizer.Take<TextToken>("123456");
// [0:6] TextToken: "123456"
AnyTokenizer(tokenizers) takes a token using any of the sub-tokenizers, whichever matches first.
var tokenizer = new AnyTokenizer<TextToken>(IntegerTokenizer.Instance, WhitespaceTokenizer.Any);
TextToken token = tokenizer.Take<TextToken>("1234567890");
// [0:10] TextToken: "1234567890"
CharTokenizer(Func<Memory,bool>) uses a delegate to evaluate which characters to take.
var tokenizer = new CharTokenizer<TextToken>(mem => mem.Span[0] == '_');
TextToken token = tokenizer.Take<TextToken>("____abc");
// [0:4] TextToken: "____"
ConstantTokenizer(string) takes a specific string.
var tokenizer = new ConstantTokenizer<SeparatorToken>(",");
SeparatorToken token = tokenizer.Take<SeparatorToken>(",x");
// [0:1] SeparatorToken: ","
FuncTokenizer(Func<Memory, Token>) adapts a func into a tokenizer.
var tokenizer = new FuncTokenizer<TextToken>(mem => new TextToken { Memory = mem.Slice(0, 3) });
TextToken token = tokenizer.Take<TextToken>("____abc");
// [0:3] TextToken: "___"
Tokenizers.Func(Func<Memory, Token>) adapts a func into a tokenizer.
var tokenizer = Tokenizers.Func<TextToken>(mem => new TextToken { Memory = mem.Slice(0, 3) });
TextToken token = tokenizer.Take<TextToken>("____abc");
// [0:3] TextToken: "___"
HexTokenizer.WithoutPrefix tokenizes hexadecimal string.
var tokenizer = HexTokenizer<ValueToken>.WithoutPrefix;
IToken token = tokenizer.Take<IToken>("0123456789ABCDEF")!;
// [0:16] ValueToken: "0123456789ABCDEF"
HexTokenizer.WithPrefix tokenizes hexadecimal string with '0x' prefix.
var tokenizer = HexTokenizer<DecimalToken>.WithPrefix;
IToken token = tokenizer.Take<IToken>("0x0123456789ABCDEF")!;
// [0:18] DecimalToken: "0x0123456789ABCDEF"
IntegerTokenizer.Instance tokenizes decimal integer characters.
var tokenizer = IntegerTokenizer<ValueToken>.Instance;
IToken token = tokenizer.Take<IToken>("0123456789")!;
// [0:10] ValueToken: "0123456789"
MalformedTokenizer.Instance takes the single following character. Typically used as the last case in AnyTokenizer.
var tokenizer = MalformedTokenizer<MalformedToken>.Instance;
IToken token = tokenizer.Take<IToken>("¤§")!;
// [0:1] MalformedToken: "¤"
NewLineTokenizer.Instance tokenizes new line character '\n'.
var tokenizer = NewLineTokenizer<NewLineToken>.Instance;
IToken token = tokenizer.Take<IToken>("\n\n\n")!;
// [0:1] NewLineToken: "\n"
RealTokenizer.Instance tokenizes a real number.
var tokenizer = RealTokenizer<DecimalToken>.Instance;
IToken token = tokenizer.Take<IToken>("-123.45600e-12 asdf")!;
// [0:14] DecimalToken: "-123.45600e-12"
RegexTokenizer tokenizes using regex pattern. Note that pattern typically starts with "^" to indicate beginning characters.
var tokenizer = new RegexTokenizer<TextToken>("^[a-zA-Z0-9]+");
IToken token = tokenizer.Take<IToken>("ab12 cd34")!;
// [0:4] TextToken: "ab12"
SequenceTokenizer(tokenizers) tokenizes a specific sequence of sub-tokenizers.
var tokenizer = new SequenceTokenizer<CompositeToken>(
IntegerTokenizer.Instance,
new ConstantTokenizer<SeparatorToken>("="),
IntegerTokenizer.Instance
);
IToken token = tokenizer.Take<IToken>("10=20")!;
//[0:5] CompositeToken: "10=20"
//├── [0:2] DecimalToken: "10"
//├── [2:3] SeparatorToken: "="
//└── [3:5] DecimalToken: "20"
SequenceTokenizer( (tokenizer, bool required)[] ) tokenizes a specific sequence of sub-tokenizers. Each sub-tokenizer can be marked optional.
var tokenizer = new SequenceTokenizer<CompositeToken>(
(WhitespaceTokenizer.Any, false),
(IntegerTokenizer.Instance, true),
(WhitespaceTokenizer.Any, false),
(new ConstantTokenizer<SeparatorToken>("="), true),
(WhitespaceTokenizer.Any, false),
(IntegerTokenizer.Instance, true),
(WhitespaceTokenizer.Any, false)
);
IToken token = tokenizer.Take<IToken>("10=20")!;
//[0:5] CompositeToken: "10=20"
//├── [0:2] DecimalToken: "10"
//├── [2:3] SeparatorToken: "="
//└── [3:5] DecimalToken: "20"
SequenceTokenizer( (tokenizer, bool required, bool yieldChildren)[] ) tokenizes a specific sequence of sub-tokenizers. Each sub-tokenizer can be marked optional, and the children of its child tokens can be yielded into the result.
var commaTokenizer = new ConstantTokenizer<OperandToken>(",");
var anyTokenizer = new AnyTokenizer(commaTokenizer, new UntilTokenizer(commaTokenizer));
var tokenizer =
new SequenceTokenizer<CompositeToken>(
(WhitespaceTokenizer.Any, false, false),
(anyTokenizer, true, true),
(WhitespaceTokenizer.Any, false, false),
(anyTokenizer, true, true),
(WhitespaceTokenizer.Any, false, false)
);
IToken token = tokenizer.Take<IToken>("A,B")!;
//[0:2] CompositeToken: "A,"
//├── [0:1] TextToken: "A"
//└── [1:2] OperandToken: ","
TrueTokenizer decorates a tokenizer so that it always returns true, even for an empty token.
var tokenizer = new TrueTokenizer<TextToken>(new RegexTokenizer<TextToken>("^[a-zA-Z0-9]?"));
TextToken token = tokenizer.Take<TextToken>("");
// [0:0] TextToken: ""
UntilTokenizer(tokenizer) tokenizes characters until end condition is met. End condition is another tokenizer. End condition is excluded from resulted token.
var tokenizer = new UntilTokenizer<CompositeToken>(NewLineTokenizer.Instance);
IToken token = tokenizer.Take<IToken>("First line\nSecond line\nThird line")!;
// [0:10] CompositeToken: "First line"
UntilTokenizer(tokenizer, endOfSpan, char escapeChar) tokenizes characters until the end condition is met. The until-tokenizer can be made escape-aware.
var tokenizer = new UntilTokenizer(new ConstantTokenizer(" "), false, '\\');
IToken token = tokenizer.Take<IToken>(@"a\ b\ c d e f")!;
// [0:7] TextToken: "a\\ b\\ c"
WhileTokenizer(tokenizer, bool yieldChildren) tokenizes while its sub-tokenizer yields tokens.
var tokenizer = new WhileTokenizer<CompositeToken>(
new AnyTokenizer(
new RegexTokenizer<TextToken>("^[a-zA-Z0-9]+"),
WhitespaceTokenizer.AllButNewLine,
NewLineTokenizer.Instance
),
yieldChildren: true
);
CompositeToken token = tokenizer.TakeAll<CompositeToken>("Hello world\nabc\n123");
// [0:19] CompositeToken: "Hello world\nabc\n123"
// ├── [0:5] TextToken: "Hello"
// ├── [5:6] WhitespaceToken: " "
// ├── [6:11] TextToken: "world"
// ├── [11:12] NewLineToken: "\n"
// ├── [12:15] TextToken: "abc"
// ├── [15:16] NewLineToken: "\n"
// └── [16:19] TextToken: "123"
WhitespaceTokenizer.Any tokenizes white-space characters.
var tokenizer = WhitespaceTokenizer.Any;
IToken token = tokenizer.Take<WhitespaceToken>(" \t\t \n ABC");
// [0:9] WhitespaceToken: " \n "
WhitespaceTokenizer.AllButNewLine tokenizes white-space characters except new-line '\n'.
var tokenizer = WhitespaceTokenizer.AllButNewLine;
IToken token = tokenizer.Take<WhitespaceToken>(" \t\t \n ABC");
// [0:6] WhitespaceToken: " "
Full Example
Full example
using Avalanche.Tokenizer;
/// <summary>
/// Runnable usage examples for the Avalanche.Tokenizer tokenizers.
/// Each braced section constructs a tokenizer, takes a token from a sample
/// input, and prints the resulting token tree to the console. The expected
/// output is shown in the trailing comments of each section.
/// NOTE(review): the numbered comment pairs (01 ... /01, 02 ... /02, etc.)
/// appear to be snippet-extraction markers for documentation tooling —
/// keep them intact when editing; confirm with the doc build.
/// </summary>
public class tokenizers
{
/// <summary>
/// Runs every tokenizer example in sequence. Each example prints its token
/// (and any child tokens) via PrintTree using the DefaultLong format.
/// </summary>
public static void Run()
{
{
// <01>
var tokenizer = AllTokenizer<TextToken>.Instance;
TextToken token = tokenizer.Take<TextToken>("123456");
// [0:6] TextToken: "123456"
// </01>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <02>
var tokenizer = new AnyTokenizer<TextToken>(IntegerTokenizer.Instance, WhitespaceTokenizer.Any);
TextToken token = tokenizer.Take<TextToken>("1234567890");
// [0:10] TextToken: "1234567890"
// </02>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <03>
var tokenizer = new CharTokenizer<TextToken>(mem => mem.Span[0] == '_');
TextToken token = tokenizer.Take<TextToken>("____abc");
// [0:4] TextToken: "____"
// </03>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <04>
var tokenizer = new ConstantTokenizer<SeparatorToken>(",");
SeparatorToken token = tokenizer.Take<SeparatorToken>(",x");
// [0:1] SeparatorToken: ","
// </04>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <05>
var tokenizer = new FuncTokenizer<TextToken>(mem => new TextToken { Memory = mem.Slice(0, 3) });
TextToken token = tokenizer.Take<TextToken>("____abc");
// [0:3] TextToken: "___"
// </05>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <05B>
var tokenizer = Tokenizers.Func<TextToken>(mem => new TextToken { Memory = mem.Slice(0, 3) });
TextToken token = tokenizer.Take<TextToken>("____abc");
// [0:3] TextToken: "___"
// </05B>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <06A>
var tokenizer = HexTokenizer<ValueToken>.WithoutPrefix;
IToken token = tokenizer.Take<IToken>("0123456789ABCDEF")!;
// [0:16] ValueToken: "0123456789ABCDEF"
// </06A>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <06B>
var tokenizer = HexTokenizer<DecimalToken>.WithPrefix;
IToken token = tokenizer.Take<IToken>("0x0123456789ABCDEF")!;
// [0:18] DecimalToken: "0x0123456789ABCDEF"
// </06B>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <07>
var tokenizer = IntegerTokenizer<ValueToken>.Instance;
IToken token = tokenizer.Take<IToken>("0123456789")!;
// [0:10] ValueToken: "0123456789"
// </07>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <08>
var tokenizer = MalformedTokenizer<MalformedToken>.Instance;
IToken token = tokenizer.Take<IToken>("¤§")!;
// [0:1] MalformedToken: "¤"
// </08>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <09>
var tokenizer = NewLineTokenizer<NewLineToken>.Instance;
IToken token = tokenizer.Take<IToken>("\n\n\n")!;
// [0:1] NewLineToken: "\n"
// </09>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <10>
var tokenizer = RealTokenizer<DecimalToken>.Instance;
IToken token = tokenizer.Take<IToken>("-123.45600e-12 asdf")!;
// [0:14] DecimalToken: "-123.45600e-12"
// </10>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <11>
var tokenizer = new RegexTokenizer<TextToken>("^[a-zA-Z0-9]+");
IToken token = tokenizer.Take<IToken>("ab12 cd34")!;
// [0:4] TextToken: "ab12"
// </11>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <12A>
var tokenizer = new SequenceTokenizer<CompositeToken>(
IntegerTokenizer.Instance,
new ConstantTokenizer<SeparatorToken>("="),
IntegerTokenizer.Instance
);
IToken token = tokenizer.Take<IToken>("10=20")!;
//[0:5] CompositeToken: "10=20"
//├── [0:2] DecimalToken: "10"
//├── [2:3] SeparatorToken: "="
//└── [3:5] DecimalToken: "20"
// </12A>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <12B>
var tokenizer = new SequenceTokenizer<CompositeToken>(
(WhitespaceTokenizer.Any, false),
(IntegerTokenizer.Instance, true),
(WhitespaceTokenizer.Any, false),
(new ConstantTokenizer<SeparatorToken>("="), true),
(WhitespaceTokenizer.Any, false),
(IntegerTokenizer.Instance, true),
(WhitespaceTokenizer.Any, false)
);
IToken token = tokenizer.Take<IToken>("10=20")!;
//[0:5] CompositeToken: "10=20"
//├── [0:2] DecimalToken: "10"
//├── [2:3] SeparatorToken: "="
//└── [3:5] DecimalToken: "20"
// </12B>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <12C>
var commaTokenizer = new ConstantTokenizer<OperandToken>(",");
var anyTokenizer = new AnyTokenizer(commaTokenizer, new UntilTokenizer(commaTokenizer));
var tokenizer =
new SequenceTokenizer<CompositeToken>(
(WhitespaceTokenizer.Any, false, false),
(anyTokenizer, true, true),
(WhitespaceTokenizer.Any, false, false),
(anyTokenizer, true, true),
(WhitespaceTokenizer.Any, false, false)
);
IToken token = tokenizer.Take<IToken>("A,B")!;
//[0:2] CompositeToken: "A,"
//├── [0:1] TextToken: "A"
//└── [1:2] OperandToken: ","
// </12C>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <13>
var tokenizer = new TrueTokenizer<TextToken>(new RegexTokenizer<TextToken>("^[a-zA-Z0-9]?"));
TextToken token = tokenizer.Take<TextToken>("");
// [0:0] TextToken: ""
// </13>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <14A>
var tokenizer = new UntilTokenizer<CompositeToken>(NewLineTokenizer.Instance);
IToken token = tokenizer.Take<IToken>("First line\nSecond line\nThird line")!;
// [0:10] CompositeToken: "First line"
// </14A>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <14B>
var tokenizer = new UntilTokenizer(new ConstantTokenizer(" "), false, '\\');
IToken token = tokenizer.Take<IToken>(@"a\ b\ c d e f")!;
// [0:7] TextToken: "a\\ b\\ c"
// </14B>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <15>
var tokenizer = new WhileTokenizer<CompositeToken>(
new AnyTokenizer(
new RegexTokenizer<TextToken>("^[a-zA-Z0-9]+"),
WhitespaceTokenizer.AllButNewLine,
NewLineTokenizer.Instance
),
yieldChildren: true
);
CompositeToken token = tokenizer.TakeAll<CompositeToken>("Hello world\nabc\n123");
// [0:19] CompositeToken: "Hello world\nabc\n123"
// ├── [0:5] TextToken: "Hello"
// ├── [5:6] WhitespaceToken: " "
// ├── [6:11] TextToken: "world"
// ├── [11:12] NewLineToken: "\n"
// ├── [12:15] TextToken: "abc"
// ├── [15:16] NewLineToken: "\n"
// └── [16:19] TextToken: "123"
// </15>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <16A>
var tokenizer = WhitespaceTokenizer.Any;
IToken token = tokenizer.Take<WhitespaceToken>(" \t\t \n ABC");
// [0:9] WhitespaceToken: " \n "
// </16A>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
{
// <16B>
var tokenizer = WhitespaceTokenizer.AllButNewLine;
IToken token = tokenizer.Take<WhitespaceToken>(" \t\t \n ABC");
// [0:6] WhitespaceToken: " "
// </16B>
Console.WriteLine(token.PrintTree(format: TokenPrintTreeExtensions.PrintFormat.DefaultLong));
}
}
}