Skip to content
Draft
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
3378ef6
Add initial lossless syntax tree implementation
Code-Grump Feb 11, 2025
448896a
Split ParsedSyntaxTreeBuilder into file-per-rule-handler
Code-Grump Feb 11, 2025
e357bda
Added an example using the syntax tree
Code-Grump Feb 12, 2025
867e9d7
Add basic diagnostics support
Code-Grump Feb 20, 2025
5794a9e
Upgrade to Gherkin 32.0.0
Code-Grump Feb 20, 2025
50e0298
Set up for syntax classes generation
Code-Grump Apr 6, 2025
d00c229
Restructure syntax types to lean more into code-generation
Code-Grump Apr 9, 2025
c4513e7
Add generation of structured trivia syntax
Code-Grump Apr 16, 2025
8b7553e
Flatten parsing handlers
Code-Grump Apr 16, 2025
f9e4f16
Switched exceptions to use resource-file messages
Code-Grump Apr 16, 2025
71bf26b
Suppress suggestions which are not compatible with .netstandard2.0
Code-Grump Apr 16, 2025
ba6eb66
Added optional parameters to applicable generated methods
Code-Grump Apr 16, 2025
6bb6245
Add parameter groups to generate factory overloads
Code-Grump Apr 17, 2025
d50f365
Split generator types to separate files
Code-Grump Apr 17, 2025
75f7cdf
Restructure parsing tests
Code-Grump Apr 19, 2025
47303af
Add syntax types required for first parsing example
Code-Grump Apr 22, 2025
dd3fdc0
Complete generation of syntax node types
Code-Grump May 26, 2025
9e4d36b
Fix issue with resx file generation not including relative namespace
Code-Grump May 31, 2025
641a54b
Add tests for parsing based on "good" samples
Code-Grump Jun 14, 2025
7b42d02
Adjust tests to remove interpolation and add directive structures
Code-Grump Jun 21, 2025
cb9c971
Restructured tree for inheritance and replace "Scenario" with "Example"
Code-Grump Jun 21, 2025
6c265ed
Add current support for abstract syntax hierachies
Code-Grump Jun 21, 2025
c4b0349
Add data table handling
Code-Grump Aug 17, 2025
ce372d6
Make table escape sequences match documented support
Code-Grump Aug 19, 2025
f444d47
Fix ordering assertions and update complex data table test expectatio…
Code-Grump Aug 19, 2025
27f1fec
Fix token uses in expected tree
Code-Grump Aug 21, 2025
c4f73a9
Implement rule handling
Code-Grump Aug 21, 2025
cb6d9b5
Fix several syntax assertions
Code-Grump Aug 22, 2025
7ba90b5
Fix CellList handling adding or inserting nodes
Code-Grump Aug 23, 2025
f079e7b
Add "feature name missing" to diagnostic descriptors
Code-Grump Aug 23, 2025
56b71b7
Split syntax to separate files
Code-Grump Aug 23, 2025
8a19634
Add parsing support for declarations where the name is optional
Code-Grump Aug 23, 2025
3f91f7b
Replace paramter groups with syntax constructors in code generation
Code-Grump Aug 30, 2025
20fea38
Restructure tree syntax to reduce syntax nodes
Code-Grump Aug 31, 2025
b2f7986
Fix test passing wrong token type
Code-Grump Sep 1, 2025
8083e9f
Fixed DescriptionRuleHandler for multi-line descriptions
Code-Grump Sep 1, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions Reqnroll.CodeAnalysis.Gherkin/.editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[*]
insert_final_newline = true
indent_size = 4
indent_style = space

[*.cs]
csharp_style_namespace_declarations = file_scoped

[*.{csproj,targets,props}]
indent_size = 2
21 changes: 21 additions & 0 deletions Reqnroll.CodeAnalysis.Gherkin/DiagnosticDescriptors.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
using Microsoft.CodeAnalysis;
using System.Resources;
using System.Runtime.CompilerServices;

namespace Reqnroll.CodeAnalysis.Gherkin;

internal static partial class DiagnosticDescriptors
{
private static ResourceManager ResourceManager { get; } = new ResourceManager(typeof(DiagnosticDescriptors));

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static string? GetResourceString(string resourceKey) => ResourceManager.GetString(resourceKey);

public static readonly DiagnosticDescriptor ErrorExpectedFeatureOrTag = new(
id: DiagnosticIds.ErrorExpectedFeatureOrTag,
title: GetResourceString("ErrorExpectedFeatureOrTagTitle")!,
messageFormat: GetResourceString("ErrorExpectedFeatureOrTagMessage")!,
"Reqnroll.Gherkin",
DiagnosticSeverity.Error,
true);
}
2 changes: 2 additions & 0 deletions Reqnroll.CodeAnalysis.Gherkin/DiagnosticDescriptors.restext
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ErrorExpectedFeatureOrTagMessage = Feature or tag expected
ErrorExpectedFeatureOrTagTitle = Feature or tag expected
6 changes: 6 additions & 0 deletions Reqnroll.CodeAnalysis.Gherkin/DiagnosticIds.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
namespace Reqnroll.CodeAnalysis.Gherkin;

internal static class DiagnosticIds
{
public const string ErrorExpectedFeatureOrTag = "G1001";
}
47 changes: 47 additions & 0 deletions Reqnroll.CodeAnalysis.Gherkin/DialectProvider.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
using Gherkin;
using Gherkin.Ast;

namespace Reqnroll.CodeAnalysis.Gherkin;

class DialectProvider(string defaultLanguage) : GherkinDialectProvider(defaultLanguage)
{
public override GherkinDialect GetDialect(string language, Location location)
{
try
{
return base.GetDialect(language, location);
}
catch (NoSuchLanguageException)
{
var index = language.LastIndexOf('-');

if (index < 0)
{
throw;
}

var fallback = language.Substring(0, index);

return GetFallbackDialect(fallback, language, location);
}
}

private GherkinDialect GetFallbackDialect(string fallback, string language, Location location)
{
var dialect = GetDialect(fallback, location);

return new GherkinDialect(
language,
dialect.FeatureKeywords,
dialect.RuleKeywords,
dialect.BackgroundKeywords,
dialect.ScenarioKeywords,
dialect.ScenarioOutlineKeywords,
dialect.ExamplesKeywords,
dialect.GivenStepKeywords,
dialect.WhenStepKeywords,
dialect.ThenStepKeywords,
dialect.AndStepKeywords,
dialect.ButStepKeywords);
}
}
43 changes: 43 additions & 0 deletions Reqnroll.CodeAnalysis.Gherkin/GherkinParseOptions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
using System.Globalization;

namespace Reqnroll.CodeAnalysis.Gherkin;

public sealed class GherkinParseOptions
{
/// <summary>
/// The default parse options.
/// </summary>
public static GherkinParseOptions Default { get; } = new();

/// <summary>
/// Gets the culture to use when parsing text. This determines the language that will be used for the Gherkin keywords.
/// If the document contains a language directive, the directive takes precedent.
/// </summary>
public CultureInfo Culture { get; private set; }

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to double check if CultureInfo is a valid representation of the default Gherkin language. Most of Gherkin language dialects match to CultureInfo names, but not all.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this has been an ongoing question for this project.

My gut feeling is that you won't be able to easily represent cultures like en-pirate this way. There's a few things here that are a little bit challenging, since we probably do want some kind of culture support: it's not just the language choice for keywords, but also how we interpret numerical values, etc. (especially if we don't want to be using the process's culture information as a fallback.)

I'm open to replacing this with something else, but I'd like us to decide whether we mean "language", "culture" or "language and an implied culture", because there's already enough ambiguity going around.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree. We do need the culture for running the step definitions. So far the concept was (not sure we always followed it) that "language" is the language notation of Gherkin en-pirate & co, which was used for parsing the file and the "binding culture" that was a .NET culture info for running the step definitions. While you can set the binding culture explicitly, we have a mechanism to automatically derive the binding culture from the feature language (for most of the cases).
Like everything, this can also be questioned, because it is clearly affects only a small portion of the feature file languages, but we cannot do that here in this PR because it would impact the other parser versions as well. So my suggestion is that in this PR, we should make a "compatible" parser with the new Roslyn style and apply conceptual changes in a separate step, otherwise it will be very hard to keep the things in sync.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So if we were to reflect the current situation, the value is an identifier of the grammar language to be used in parsing and sets a default for the execution culture (with the usual fallbacks.)

It's not required to be an ISO culture code supported by the host system, so using CultureInfo here is inappropriate. I need to choose an alternative data-type, and I'm always hesitant to just use a string when there's an expected subset of values. In this case, could any string be a language identifier (with spaces and emojis), or does it have to follow the ISO culture-code structure?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you can find any better than string. As you said it is an identifier for the Gherkin language setting listed here: https://github.com/cucumber/gherkin/blob/main/gherkin-languages.json . I don't think there is any restriction for the value itself, but so far it was something built up from Latin letters and -, but I don't think this was ever written down (see https://github.com/cucumber/gherkin/blob/main/CONTRIBUTING.md#adding-or-updating-an-i18n-language).

But I would like to note that the names you see at CultureInfo are not ISO standard either. They are just a fancy combination of different standards, but the way how they are combined was invented by Microsoft. https://learn.microsoft.com/en-us/dotnet/api/system.globalization.cultureinfo.name?view=net-9.0 (highlight from me)

The culture name in the format languagecode2-country/regioncode2 [...]. languagecode2 is a lowercase two-letter code as defined in ISO 639-1, or, if no two-letter code is available, a three-letter code as defined in ISO 639-3. country/regioncode2 contains a value defined in ISO 3166 and usually consists of two uppercase letters, or a BCP-47 language tag.


internal GherkinParseOptions(CultureInfo? culture = null)
{
Culture = culture ?? CultureInfo.CurrentCulture;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The compilation should never depend on the user's culture setting, because that causes strange behavior for teams with people from different countries and also problematic on the CI. This is why in SpecFlow/Reqnroll the overall default is en-US.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find this a little odd and wasn't aware of this behaviour in SpecFlow and Reqnroll - the default fallback to current culture is pretty much standard across all .NET libraries. If the culture needs to be set for the purposes of running in a CI build, I think there's expectations to do it through appropriate configuration, whether that's at the CI level or via a tool-specific configuration file like ours.

I don't mind changing this to match the existing behaviour, but it surprises me that it is the existing behaviour and being surprised makes me want to question it.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your analogy does not work fully. The analogy would be if you would need to write your floating point numbers in C# depending on the culture setting of the machine where your compile. So the C# code var f = 1.23 that compiles on yours would not compile on my machine (Hungarian culture), as you would need to write var f = 1,23. This is obviously not the case, and this is why C# language specs is deterministically set how to set the floats. The C# compiler still has some options, e.g. a code that compiles successfully with nullable disabled might cause warnings with nullable enabled, but it does not depend on environmental settings.
The feature file handling is similar to that. The parser needs to deterministically decide if a Feature keyword is valid or not during compilation time. If we would use the default culture than I could not compile your Reqnroll project as I have Hungarian culture set, where the feature keyword is Jellemző. Also in our case the "binding culture" has to be deterministic, because you might write Then the result is 1.23 into the feature file that is bound to a step definition with a double parameter. Since the feature files are representing the specification they have to be unambiguous, so it cannot happen that on my machine the test fail (because Hungarian treats . as thousand separator, or pass, because in English it is the decimal separator).
We started SpecFlow at a company who had offices in Hungary and Austria so we faced this issue right from the beginning. (Our first implementation used the current culture, but it didn't work in cross-country projects.)

So all in all, this does not behave like a library, much more like a programming language.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think Gherkin is special in this regard, because most (if not all) other programming languages are specified using a culture-agnostic system; you don't get keywords in your language of choice for C#, or parsing floating point values with your local number-system.

For Gherkin that is the expectation: use the culture that works best to write a specification and collaborate with your team.

So the suprise for me is just that the default isn't "your local culture" as it would be for other culture-sensitive tools. I appreciate that didn't work for the original authoring project, which obviously steered the choice to an explicit default.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use the culture that works best to write a specification and collaborate with your team [...] So the surprise for me is just that the default isn't "your local culture"

Welcome to my world. It's hard to make statistics, but in the most of the countries (the non-English speaking ones), many (the majority?) devs work on domain culture that is not the same as their local culture. In the last 20 years I never worked on a Hungarian domain for example. There have been some Hungarian clients, but those were part of (or planned to be part of) global organizations, so the project domain language was English (few cases German). But even in UK/US you would see many devs who has their Windows set for their native language/culture.

[...] as it would be for other culture-sensitive tools.

It's hard to find a good comparison and culture-sensitivity is a complex topic. You can think about Excel for example, were you want to fix references, formulas and calculations and you want that to work the same everywhere. Excel supports culture-sensitivity, but it saves the result into the file in a culture-agnostic way. So I can make an excel sheet that uses Hungarian float numbers and make a cell with the formula =ÖSSZEG(A:A) (this is the Hungarian version of SUM), but if you would open the file, you would see the result in UK decimal separators and the cell would show you =SUM(A:A).

I cannot really name any tool, where the created artifact would be culture-dependent and would be interpreted differently on machines with other cultures. That would lead to a chaos.

I think what you mean is that if our VS extension would be culture specific, the project wizard would generate a Reqnroll project with the culture of the current user. (So for me it would generate a Reqnroll project with the default language and the sample feature file using Hungarian.) Interestingly we have tried this with SpecFlow, but we got a massive resistance from the users (probably because of the reason I mentioned above) so we had to revert it back to generating English projects by default.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for indulging me in this topic. It's been really interesting to see how the decision was made. I think the parallels with Excel are probably the closest to what we have: parse the input in one language but encode the output in an agnostic fashion (for us, C# tests.)

Once we've decided how to properly encode the language configuration, I'll make en-US the default here.

}

private GherkinParseOptions(GherkinParseOptions other)
{
Culture = other.Culture;
}

/// <summary>
/// Returns a <see cref="GherkinParseOptions"/> instance for a specified culture.
/// </summary>
/// <param name="culture">The culture to use.</param>
/// <returns>If the culture is different, a new instance of <see cref="GherkinParseOptions"/> with the specified culture;
/// otherwise the current options instance.</returns>
public GherkinParseOptions WithCulture(CultureInfo culture)
{
if (culture == Culture)
{
return this;
}

return new GherkinParseOptions(this) { Culture = culture };
}
}
133 changes: 133 additions & 0 deletions Reqnroll.CodeAnalysis.Gherkin/GherkinSyntaxTree.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
using Gherkin;
using System.Text;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.Text;

namespace Reqnroll.CodeAnalysis.Gherkin;

using Reqnroll.CodeAnalysis.Gherkin.Syntax;

/// <summary>
/// A container of Gherkin syntax.
/// </summary>
/// <remarks>
/// <para>This Roslyn-like representation enables:</para>
/// <list type="bullet">
/// <item>Simple syntactic equivalence comparison.</item>
/// <item>Full mapping of Gherkin syntax positions to generated code because this model tracks whitespace.</item>
/// <item>Future incremental parsing enhancements which rely on a model that can offset existing elements.</item>
/// </list>
/// </remarks>
public class GherkinSyntaxTree
{
private SourceText? _text;
private readonly SyntaxNode _root;

internal GherkinSyntaxTree(SourceText? text, SyntaxNode root, GherkinParseOptions options, string filePath)
{
_text = text;
_root = SyntaxNode.CloneAsRoot(root, this);
Options = options;
FilePath = filePath;
}

/// <summary>
/// Gets the options used by the parser to produce the syntax tree.
/// </summary>
public GherkinParseOptions Options { get; }

/// <summary>
/// Gets the path of the source document file.
/// </summary>
/// <value>The path of the source document file, or an empty string if the syntax tree is not associated with a file.</value>
public string FilePath { get; }

/// <summary>
/// Gets a list of all the diagnostics in the syntax tree.
/// </summary>
/// <param name="cancellationToken">A token used to signal when the operation should be canceled.</param>
/// <returns>A list of the <see cref="Diagnostic"/> instances associated with nodes in the tree.</returns>
public IEnumerable<Diagnostic> GetDiagnostics(CancellationToken cancellationToken = default) =>
GetDiagnostics(GetRoot(cancellationToken));

public IEnumerable<Diagnostic> GetDiagnostics(SyntaxToken token) => GetDiagnostics(token.RawNode, token.Position);

public IEnumerable<Diagnostic> GetDiagnostics(SyntaxTrivia trivia) => GetDiagnostics(trivia.RawNode, trivia.Position);

public IEnumerable<Diagnostic> GetDiagnostics(SyntaxNode node) => GetDiagnostics(node.RawNode, node.Position);

private IEnumerable<Diagnostic> GetDiagnostics(Syntax.Internal.RawNode? node, int position)
{
if (node != null && node.ContainsDiagnostics)
{
return EnumerateDiagnostics(node, position);
}

return [];
}

private IEnumerable<Diagnostic> EnumerateDiagnostics(Syntax.Internal.RawNode node, int position)
{
throw new NotImplementedException();
}

public SyntaxNode GetRoot(CancellationToken cancellationToken = default) => _root;

public SourceText GetSourceText(CancellationToken cancellationToken)
{
_text ??= GetRoot(cancellationToken).GetText();

return _text;
}

public static GherkinSyntaxTree ParseText(
string text,
GherkinParseOptions? options = null,
string path = "",
Encoding? encoding = null,
CancellationToken cancellationToken = default)
{
return ParseText(
SourceText.From(text, encoding, SourceHashAlgorithm.Sha1),
options,
path,
cancellationToken);
}

public static GherkinSyntaxTree ParseText(
SourceText text,
GherkinParseOptions? options = null,
string path = "",
CancellationToken cancellationToken = default)
{
#if NET6_0_OR_GREATER
ArgumentNullException.ThrowIfNull(nameof(text));
#else
if (text == null)
{
throw new ArgumentNullException(nameof(text));
}
#endif

options ??= GherkinParseOptions.Default;

var matcher = new TokenMatcher(new DialectProvider(options.Culture.Name));
var builder = new Syntax.Internal.ParsedSyntaxTreeBuilder(options, text, path, cancellationToken);
var parser = new SyntaxParser(builder);
var tokens = new SourceTokenScanner(text);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the TokenScanner and the SyntaxParser is in the root and the ParsedSyntaxTreeBuilder is in the internal?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mostly it's down to locality of the code they interact with. Most of these types only interact to implement Gherkin interfaces and are only composed here in the GherkinSyntaxTree so they don't reference much else. The ParsedSyntaxTreeBuilder works extensively with the "internal" model, so made sense to me to keep it off in that corner of the codebase where it is highly coupled.


return parser.Parse(tokens, matcher);
}

public Location GetLocation(TextSpan span) =>
Location.Create(FilePath, span, GetSourceText(CancellationToken.None).Lines.GetLinePositionSpan(span));

public SourceText GetText(CancellationToken cancellationToken)
{
_text ??= GetRoot(cancellationToken).GetText();

return _text;
}

public override string ToString() => GetText(CancellationToken.None).ToString();
}
16 changes: 16 additions & 0 deletions Reqnroll.CodeAnalysis.Gherkin/Hash.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
internal static class Hash
{
private const int Prime1 = 79754099;
private const int Prime2 = 60214523;

public static int Combine<T1, T2, T3>(T1 value1, T2 value2, T3 value3)
{
var hash = Prime1;

hash = (Prime2 * hash) + value1?.GetHashCode() ?? 0;
hash = (Prime2 * hash) + value2?.GetHashCode() ?? 0;
hash = (Prime2 * hash) + value3?.GetHashCode() ?? 0;

return hash;
}
}
22 changes: 22 additions & 0 deletions Reqnroll.CodeAnalysis.Gherkin/IsExternalInit.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Licensed to the .NET Foundation under one or more agreements.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should put these marker/polyfill classes to a separate folder (.polyfill?) to keep the project structure clean.

// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

#if NETSTANDARD2_0 || NETSTANDARD2_1 || NETCOREAPP2_0 || NETCOREAPP2_1 || NETCOREAPP2_2 || NETCOREAPP3_0 || NETCOREAPP3_1 || NET45 || NET451 || NET452 || NET6 || NET461 || NET462 || NET47 || NET471 || NET472 || NET48

using System.ComponentModel;

// ReSharper disable once CheckNamespace
namespace System.Runtime.CompilerServices
{
/// <summary>
/// Reserved to be used by the compiler for tracking metadata.
/// This class should not be used by developers in source code.
/// </summary>
[EditorBrowsable(EditorBrowsableState.Never)]
internal static class IsExternalInit
{
}
}

#endif
Loading