-
Notifications
You must be signed in to change notification settings - Fork 118
Add initial lossless syntax tree implementation #443
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
3378ef6
448896a
e357bda
867e9d7
5794a9e
50e0298
d00c229
c4513e7
8b7553e
f9e4f16
71bf26b
ba6eb66
6bb6245
d50f365
75f7cdf
47303af
dd3fdc0
9e4d36b
641a54b
7b42d02
cb9c971
6c265ed
c4b0349
ce372d6
f444d47
27f1fec
c4f73a9
cb6d9b5
7ba90b5
f079e7b
56b71b7
8a19634
3f91f7b
20fea38
b2f7986
8083e9f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| [*] | ||
| insert_final_newline = true | ||
| indent_size = 4 | ||
| indent_style = space | ||
|
|
||
| [*.cs] | ||
| csharp_style_namespace_declarations = file_scoped | ||
|
|
||
| [*.{csproj,targets,props}] | ||
| indent_size = 2 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| using Microsoft.CodeAnalysis; | ||
| using System.Resources; | ||
| using System.Runtime.CompilerServices; | ||
|
|
||
| namespace Reqnroll.CodeAnalysis.Gherkin; | ||
|
|
||
| internal static partial class DiagnosticDescriptors | ||
| { | ||
| private static ResourceManager ResourceManager { get; } = new ResourceManager(typeof(DiagnosticDescriptors)); | ||
|
|
||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| internal static string? GetResourceString(string resourceKey) => ResourceManager.GetString(resourceKey); | ||
|
|
||
| public static readonly DiagnosticDescriptor ErrorExpectedFeatureOrTag = new( | ||
| id: DiagnosticIds.ErrorExpectedFeatureOrTag, | ||
| title: GetResourceString("ErrorExpectedFeatureOrTagTitle")!, | ||
| messageFormat: GetResourceString("ErrorExpectedFeatureOrTagMessage")!, | ||
| "Reqnroll.Gherkin", | ||
| DiagnosticSeverity.Error, | ||
| true); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| ErrorExpectedFeatureOrTagMessage = Feature or tag expected | ||
| ErrorExpectedFeatureOrTagTitle = Feature or tag expected |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| namespace Reqnroll.CodeAnalysis.Gherkin; | ||
|
|
||
| internal static class DiagnosticIds | ||
| { | ||
| public const string ErrorExpectedFeatureOrTag = "G1001"; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
| using Gherkin; | ||
| using Gherkin.Ast; | ||
|
|
||
| namespace Reqnroll.CodeAnalysis.Gherkin; | ||
|
|
||
| class DialectProvider(string defaultLanguage) : GherkinDialectProvider(defaultLanguage) | ||
| { | ||
| public override GherkinDialect GetDialect(string language, Location location) | ||
| { | ||
| try | ||
| { | ||
| return base.GetDialect(language, location); | ||
| } | ||
| catch (NoSuchLanguageException) | ||
| { | ||
| var index = language.LastIndexOf('-'); | ||
|
|
||
| if (index < 0) | ||
| { | ||
| throw; | ||
| } | ||
|
|
||
| var fallback = language.Substring(0, index); | ||
|
|
||
| return GetFallbackDialect(fallback, language, location); | ||
| } | ||
| } | ||
|
|
||
| private GherkinDialect GetFallbackDialect(string fallback, string language, Location location) | ||
| { | ||
| var dialect = GetDialect(fallback, location); | ||
|
|
||
| return new GherkinDialect( | ||
| language, | ||
| dialect.FeatureKeywords, | ||
| dialect.RuleKeywords, | ||
| dialect.BackgroundKeywords, | ||
| dialect.ScenarioKeywords, | ||
| dialect.ScenarioOutlineKeywords, | ||
| dialect.ExamplesKeywords, | ||
| dialect.GivenStepKeywords, | ||
| dialect.WhenStepKeywords, | ||
| dialect.ThenStepKeywords, | ||
| dialect.AndStepKeywords, | ||
| dialect.ButStepKeywords); | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,43 @@ | ||
| using System.Globalization; | ||
|
|
||
| namespace Reqnroll.CodeAnalysis.Gherkin; | ||
|
|
||
| public sealed class GherkinParseOptions | ||
| { | ||
| /// <summary> | ||
| /// The default parse options. | ||
| /// </summary> | ||
| public static GherkinParseOptions Default { get; } = new(); | ||
|
|
||
| /// <summary> | ||
| /// Gets the culture to use when parsing text. This determines the language that will be used for the Gherkin keywords. | ||
| /// If the document contains a language directive, the directive takes precedent. | ||
| /// </summary> | ||
| public CultureInfo Culture { get; private set; } | ||
|
|
||
| internal GherkinParseOptions(CultureInfo? culture = null) | ||
| { | ||
| Culture = culture ?? CultureInfo.CurrentCulture; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The compilation should never depend on the user's culture setting, because that causes strange behavior for teams with people from different countries and also problematic on the CI. This is why in SpecFlow/Reqnroll the overall default is en-US.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I find this a little odd and wasn't aware of this behaviour in SpecFlow and Reqnroll - the default fallback to current culture is pretty much standard across all .NET libraries. If the culture needs to be set for the purposes of running in a CI build, I think there's expectations to do it through appropriate configuration, whether that's at the CI level or via a tool-specific configuration file like ours. I don't mind changing this to match the existing behaviour, but it surprises me that it is the existing behaviour and being surprised makes me want to question it.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Your analogy does not work fully. The analogy would be if you would need to write your floating point numbers in C# depending on the culture setting of the machine where your compile. So the C# code So all in all, this does not behave like a library, much more like a programming language.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think Gherkin is special in this regard, because most (if not all) other programming languages are specified using a culture-agnostic system; you don't get keywords in your language of choice for C#, or parsing floating point values with your local number-system. For Gherkin that is the expectation: use the culture that works best to write a specification and collaborate with your team. So the suprise for me is just that the default isn't "your local culture" as it would be for other culture-sensitive tools. I appreciate that didn't work for the original authoring project, which obviously steered the choice to an explicit default.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Welcome to my world. It's hard to make statistics, but in the most of the countries (the non-English speaking ones), many (the majority?) devs work on domain culture that is not the same as their local culture. In the last 20 years I never worked on a Hungarian domain for example. There have been some Hungarian clients, but those were part of (or planned to be part of) global organizations, so the project domain language was English (few cases German). But even in UK/US you would see many devs who has their Windows set for their native language/culture.
It's hard to find a good comparison and culture-sensitivity is a complex topic. You can think about Excel for example, were you want to fix references, formulas and calculations and you want that to work the same everywhere. Excel supports culture-sensitivity, but it saves the result into the file in a culture-agnostic way. So I can make an excel sheet that uses Hungarian float numbers and make a cell with the formula I cannot really name any tool, where the created artifact would be culture-dependent and would be interpreted differently on machines with other cultures. That would lead to a chaos. I think what you mean is that if our VS extension would be culture specific, the project wizard would generate a Reqnroll project with the culture of the current user. (So for me it would generate a Reqnroll project with the default language and the sample feature file using Hungarian.) Interestingly we have tried this with SpecFlow, but we got a massive resistance from the users (probably because of the reason I mentioned above) so we had to revert it back to generating English projects by default.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for indulging me in this topic. It's been really interesting to see how the decision was made. I think the parallels with Excel are probably the closest to what we have: parse the input in one language but encode the output in an agnostic fashion (for us, C# tests.) Once we've decided how to properly encode the language configuration, I'll make |
||
| } | ||
|
|
||
| private GherkinParseOptions(GherkinParseOptions other) | ||
| { | ||
| Culture = other.Culture; | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Returns a <see cref="GherkinParseOptions"/> instance for a specified culture. | ||
| /// </summary> | ||
| /// <param name="culture">The culture to use.</param> | ||
| /// <returns>If the culture is different, a new instance of <see cref="GherkinParseOptions"/> with the specified culture; | ||
| /// otherwise the current options instance.</returns> | ||
| public GherkinParseOptions WithCulture(CultureInfo culture) | ||
| { | ||
| if (culture == Culture) | ||
| { | ||
| return this; | ||
| } | ||
|
|
||
| return new GherkinParseOptions(this) { Culture = culture }; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,133 @@ | ||
| using Gherkin; | ||
| using System.Text; | ||
| using Microsoft.CodeAnalysis; | ||
| using Microsoft.CodeAnalysis.Text; | ||
|
|
||
| namespace Reqnroll.CodeAnalysis.Gherkin; | ||
|
|
||
| using Reqnroll.CodeAnalysis.Gherkin.Syntax; | ||
|
|
||
| /// <summary> | ||
| /// A container of Gherkin syntax. | ||
| /// </summary> | ||
| /// <remarks> | ||
| /// <para>This Roslyn-like representation enables:</para> | ||
| /// <list type="bullet"> | ||
| /// <item>Simple syntactic equivalence comparison.</item> | ||
| /// <item>Full mapping of Gherkin syntax positions to generated code because this model tracks whitespace.</item> | ||
| /// <item>Future incremental parsing enhancements which rely on a model that can offset existing elements.</item> | ||
| /// </list> | ||
| /// </remarks> | ||
| public class GherkinSyntaxTree | ||
| { | ||
| private SourceText? _text; | ||
| private readonly SyntaxNode _root; | ||
|
|
||
| internal GherkinSyntaxTree(SourceText? text, SyntaxNode root, GherkinParseOptions options, string filePath) | ||
| { | ||
| _text = text; | ||
| _root = SyntaxNode.CloneAsRoot(root, this); | ||
| Options = options; | ||
| FilePath = filePath; | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Gets the options used by the parser to produce the syntax tree. | ||
| /// </summary> | ||
| public GherkinParseOptions Options { get; } | ||
|
|
||
| /// <summary> | ||
| /// Gets the path of the source document file. | ||
| /// </summary> | ||
| /// <value>The path of the source document file, or an empty string if the syntax tree is not associated with a file.</value> | ||
| public string FilePath { get; } | ||
|
|
||
| /// <summary> | ||
| /// Gets a list of all the diagnostics in the syntax tree. | ||
| /// </summary> | ||
| /// <param name="cancellationToken">A token used to signal when the operation should be canceled.</param> | ||
| /// <returns>A list of the <see cref="Diagnostic"/> instances associated with nodes in the tree.</returns> | ||
| public IEnumerable<Diagnostic> GetDiagnostics(CancellationToken cancellationToken = default) => | ||
| GetDiagnostics(GetRoot(cancellationToken)); | ||
|
|
||
| public IEnumerable<Diagnostic> GetDiagnostics(SyntaxToken token) => GetDiagnostics(token.RawNode, token.Position); | ||
|
|
||
| public IEnumerable<Diagnostic> GetDiagnostics(SyntaxTrivia trivia) => GetDiagnostics(trivia.RawNode, trivia.Position); | ||
|
|
||
| public IEnumerable<Diagnostic> GetDiagnostics(SyntaxNode node) => GetDiagnostics(node.RawNode, node.Position); | ||
|
|
||
| private IEnumerable<Diagnostic> GetDiagnostics(Syntax.Internal.RawNode? node, int position) | ||
| { | ||
| if (node != null && node.ContainsDiagnostics) | ||
| { | ||
| return EnumerateDiagnostics(node, position); | ||
| } | ||
|
|
||
| return []; | ||
| } | ||
|
|
||
| private IEnumerable<Diagnostic> EnumerateDiagnostics(Syntax.Internal.RawNode node, int position) | ||
| { | ||
| throw new NotImplementedException(); | ||
| } | ||
|
|
||
| public SyntaxNode GetRoot(CancellationToken cancellationToken = default) => _root; | ||
|
|
||
| public SourceText GetSourceText(CancellationToken cancellationToken) | ||
| { | ||
| _text ??= GetRoot(cancellationToken).GetText(); | ||
|
|
||
| return _text; | ||
| } | ||
|
|
||
| public static GherkinSyntaxTree ParseText( | ||
| string text, | ||
| GherkinParseOptions? options = null, | ||
| string path = "", | ||
| Encoding? encoding = null, | ||
| CancellationToken cancellationToken = default) | ||
| { | ||
| return ParseText( | ||
| SourceText.From(text, encoding, SourceHashAlgorithm.Sha1), | ||
| options, | ||
| path, | ||
| cancellationToken); | ||
| } | ||
|
|
||
| public static GherkinSyntaxTree ParseText( | ||
| SourceText text, | ||
| GherkinParseOptions? options = null, | ||
| string path = "", | ||
| CancellationToken cancellationToken = default) | ||
| { | ||
| #if NET6_0_OR_GREATER | ||
| ArgumentNullException.ThrowIfNull(nameof(text)); | ||
| #else | ||
| if (text == null) | ||
| { | ||
| throw new ArgumentNullException(nameof(text)); | ||
| } | ||
| #endif | ||
|
|
||
| options ??= GherkinParseOptions.Default; | ||
|
|
||
| var matcher = new TokenMatcher(new DialectProvider(options.Culture.Name)); | ||
| var builder = new Syntax.Internal.ParsedSyntaxTreeBuilder(options, text, path, cancellationToken); | ||
| var parser = new SyntaxParser(builder); | ||
| var tokens = new SourceTokenScanner(text); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why the TokenScanner and the SyntaxParser is in the root and the ParsedSyntaxTreeBuilder is in the internal?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mostly it's down to locality of the code they interact with. Most of these types only interact to implement Gherkin interfaces and are only composed here in the |
||
|
|
||
| return parser.Parse(tokens, matcher); | ||
| } | ||
|
|
||
| public Location GetLocation(TextSpan span) => | ||
| Location.Create(FilePath, span, GetSourceText(CancellationToken.None).Lines.GetLinePositionSpan(span)); | ||
|
|
||
| public SourceText GetText(CancellationToken cancellationToken) | ||
| { | ||
| _text ??= GetRoot(cancellationToken).GetText(); | ||
|
|
||
| return _text; | ||
| } | ||
|
|
||
| public override string ToString() => GetText(CancellationToken.None).ToString(); | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,16 @@ | ||
| internal static class Hash | ||
| { | ||
| private const int Prime1 = 79754099; | ||
| private const int Prime2 = 60214523; | ||
|
|
||
| public static int Combine<T1, T2, T3>(T1 value1, T2 value2, T3 value3) | ||
| { | ||
| var hash = Prime1; | ||
|
|
||
| hash = (Prime2 * hash) + value1?.GetHashCode() ?? 0; | ||
| hash = (Prime2 * hash) + value2?.GetHashCode() ?? 0; | ||
| hash = (Prime2 * hash) + value3?.GetHashCode() ?? 0; | ||
|
|
||
| return hash; | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| // Licensed to the .NET Foundation under one or more agreements. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should put these marker/polyfill classes to a separate folder ( |
||
| // The .NET Foundation licenses this file to you under the MIT license. | ||
| // See the LICENSE file in the project root for more information. | ||
|
|
||
| #if NETSTANDARD2_0 || NETSTANDARD2_1 || NETCOREAPP2_0 || NETCOREAPP2_1 || NETCOREAPP2_2 || NETCOREAPP3_0 || NETCOREAPP3_1 || NET45 || NET451 || NET452 || NET6 || NET461 || NET462 || NET47 || NET471 || NET472 || NET48 | ||
|
|
||
| using System.ComponentModel; | ||
|
|
||
| // ReSharper disable once CheckNamespace | ||
| namespace System.Runtime.CompilerServices | ||
| { | ||
| /// <summary> | ||
| /// Reserved to be used by the compiler for tracking metadata. | ||
| /// This class should not be used by developers in source code. | ||
| /// </summary> | ||
| [EditorBrowsable(EditorBrowsableState.Never)] | ||
| internal static class IsExternalInit | ||
| { | ||
| } | ||
| } | ||
|
|
||
| #endif | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We need to double check if
CultureInfois a valid representation of the default Gherkin language. Most of Gherkin language dialects match to CultureInfo names, but not all.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this has been an ongoing question for this project.
My gut feeling is that you won't be able to easily represent cultures like
en-piratethis way. There's a few things here that are a little bit challenging, since we probably do want some kind of culture support: it's not just the language choice for keywords, but also how we interpret numerical values, etc. (especially if we don't want to be using the process's culture information as a fallback.)I'm open to replacing this with something else, but I'd like us to decide whether we mean "language", "culture" or "language and an implied culture", because there's already enough ambiguity going around.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree. We do need the culture for running the step definitions. So far the concept was (not sure we always followed it) that "language" is the language notation of Gherkin
en-pirate& co, which was used for parsing the file and the "binding culture" that was a .NET culture info for running the step definitions. While you can set the binding culture explicitly, we have a mechanism to automatically derive the binding culture from the feature language (for most of the cases).Like everything, this can also be questioned, because it is clearly affects only a small portion of the feature file languages, but we cannot do that here in this PR because it would impact the other parser versions as well. So my suggestion is that in this PR, we should make a "compatible" parser with the new Roslyn style and apply conceptual changes in a separate step, otherwise it will be very hard to keep the things in sync.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So if we were to reflect the current situation, the value is an identifier of the grammar language to be used in parsing and sets a default for the execution culture (with the usual fallbacks.)
It's not required to be an ISO culture code supported by the host system, so using
CultureInfohere is inappropriate. I need to choose an alternative data-type, and I'm always hesitant to just use a string when there's an expected subset of values. In this case, could any string be a language identifier (with spaces and emojis), or does it have to follow the ISO culture-code structure?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think you can find any better than
string. As you said it is an identifier for the Gherkin language setting listed here: https://github.com/cucumber/gherkin/blob/main/gherkin-languages.json . I don't think there is any restriction for the value itself, but so far it was something built up from Latin letters and-, but I don't think this was ever written down (see https://github.com/cucumber/gherkin/blob/main/CONTRIBUTING.md#adding-or-updating-an-i18n-language).But I would like to note that the names you see at
CultureInfoare not ISO standard either. They are just a fancy combination of different standards, but the way how they are combined was invented by Microsoft. https://learn.microsoft.com/en-us/dotnet/api/system.globalization.cultureinfo.name?view=net-9.0 (highlight from me)