From e1641b58508ed0a89e19ed415a74f53ea93b8216 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20L=2E=20Charlier?= Date: Sun, 22 Dec 2024 17:32:06 +0100 Subject: [PATCH] feat: align dialect definition with data package v2 (#59) * feat: 100% coverage of dialect definition for delimited files from data package v2 * feat: support commentRows dialect's description * feat: headerRows and headerJoin to define multi-lines headers --- PocketCsvReader.Testing/CharParserTest.cs | 73 ++++++++++++---- .../DialectDescriptorBuilderTest.cs | 87 ++++++++++++++----- PocketCsvReader.Testing/CsvDataReaderTest.cs | 71 +++++++++++++-- PocketCsvReader.Testing/CsvDataTableTest.cs | 2 +- PocketCsvReader.Testing/RecordParserTest.cs | 23 +++-- PocketCsvReader.Testing/StringMapperTest.cs | 8 +- PocketCsvReader/CharParser.cs | 7 ++ .../CharParsing/CharOfFieldLookupParser.cs | 7 +- .../CharParsing/CharOfFieldParser.cs | 4 +- .../FirstCharOfFieldLookupParser.cs | 14 +-- .../CharParsing/FirstCharOfFieldParser.cs | 8 +- .../FirstCharOfQuotedFieldParser.cs | 8 +- .../CharParsing/FirstCharOfRecordParser.cs | 20 ++++- .../CharParsing/LineTerminatorParser.cs | 2 +- .../Configuration/DialectDescriptor.cs | 29 +++++++ .../Configuration/DialectDescriptorBuilder.cs | 58 +++++++++---- PocketCsvReader/CsvArrayString.cs | 21 ++++- PocketCsvReader/CsvDataReader.cs | 23 ++++- PocketCsvReader/CsvDialectDescriptor.cs | 23 ----- PocketCsvReader/CsvProfile.cs | 7 +- PocketCsvReader/CsvReader.cs | 2 +- PocketCsvReader/FieldEscaper.cs | 6 +- PocketCsvReader/RecordParser.cs | 55 +++++------- docs/_docs/csv-dialect-descriptor.md | 29 ++++++- .../_docs/fluent-api-profile-configuration.md | 8 +- 25 files changed, 420 insertions(+), 175 deletions(-) create mode 100644 PocketCsvReader/Configuration/DialectDescriptor.cs delete mode 100644 PocketCsvReader/CsvDialectDescriptor.cs diff --git a/PocketCsvReader.Testing/CharParserTest.cs b/PocketCsvReader.Testing/CharParserTest.cs index 9a6eb49..777af92 100644 --- 
a/PocketCsvReader.Testing/CharParserTest.cs +++ b/PocketCsvReader.Testing/CharParserTest.cs @@ -57,7 +57,7 @@ public void Parse_FieldLineTerminator_StartEnd(string value, string sep, int sta public void Parse_FieldLineTerminatorSingleChar_StartEnd(string value, string sep, int start, int length) { var parser = new CharParser(new CsvProfile( - new CsvDialectDescriptor() { Delimiter = ',', QuoteChar = '\'', LineTerminator = sep })); + new DialectDescriptor() { Delimiter = ',', QuoteChar = '\'', LineTerminator = sep })); var result = value.Aggregate((ParserState?)null, (current, c) => parser.Parse(c)); Assert.That(result, Is.EqualTo(ParserState.Record)); @@ -71,7 +71,7 @@ public void Parse_FieldLineTerminatorSingleChar_StartEnd(string value, string se [TestCase("#foobar")] public void Parse_Comment_StartEnd(string value) { - var parser = new CharParser(new CsvProfile(new CsvDialectDescriptor() {CommentChar='#', Delimiter=';', LineTerminator="\r\n" })); + var parser = new CharParser(new CsvProfile(new DialectDescriptor() { Header = false, CommentChar = '#', Delimiter = ';', LineTerminator = "\r\n" })); var result = value.Aggregate((ParserState?)null, (current, c) => parser.Parse(c)); Assert.That(result, Is.EqualTo(ParserState.Continue)); @@ -83,7 +83,7 @@ public void Parse_Comment_StartEnd(string value) [TestCase("bar")] public void Parse_AfterComment_StartEnd(string value) { - var parser = new CharParser(new CsvProfile(new CsvDialectDescriptor() { CommentChar = '#', Delimiter = ';', LineTerminator = "\r\n" })); + var parser = new CharParser(new CsvProfile(new DialectDescriptor() { CommentChar = '#', Delimiter = ';', LineTerminator = "\r\n" })); var result = value.Aggregate((ParserState?)null, (current, c) => parser.Parse(c)); result = parser.ParseEof(); @@ -97,9 +97,9 @@ public void Parse_AfterComment_StartEnd(string value) [TestCase("bar\r\n", 1)] public void Parse_Record_CountOfField(string value, int count) { - var parser = new CharParser(new CsvProfile(new 
CsvDialectDescriptor() { Delimiter = ';', LineTerminator = "\r\n" })); + var parser = new CharParser(new CsvProfile(new DialectDescriptor() { Delimiter = ';', LineTerminator = "\r\n" })); var result = value.Aggregate(0, (current, c) - => parser.Parse(c) != ParserState.Continue ? current+1 : current); + => parser.Parse(c) != ParserState.Continue ? current + 1 : current); Assert.That(result, Is.EqualTo(count)); } @@ -111,7 +111,18 @@ public void Parse_Record_CountOfField(string value, int count) [TestCase("bar;\r\nfoo\r\n", 2)] public void Parse_Record_CountOfRecord(string value, int count) { - var parser = new CharParser(new CsvProfile(new CsvDialectDescriptor() { Delimiter = ';', LineTerminator = "\r\n" })); + var parser = new CharParser(new CsvProfile(new DialectDescriptor() { Header= false, Delimiter = ';', LineTerminator = "\r\n" })); + var result = value.Aggregate(0, (current, c) + => parser.Parse(c) == ParserState.Record ? current + 1 : current); + + Assert.That(result, Is.EqualTo(count)); + } + + [TestCase("field_1;field_2\r\nfoo;bar\r\nfoo;bar\r\nfoo;bar\r\n", 3)] + [TestCase("field_1\r\nbar\r\n", 1)] + public void Parse_RecordAndHeader_CountOfRecord(string value, int count) + { + var parser = new CharParser(new CsvProfile(new DialectDescriptor() { Header = true, Delimiter = ';', LineTerminator = "\r\n" })); var result = value.Aggregate(0, (current, c) => parser.Parse(c) == ParserState.Record ? 
current + 1 : current); @@ -127,7 +138,7 @@ public void Parse_Record_CountOfRecord(string value, int count) [TestCase("'f\ro\no';", "f\ro\no")] public void Parse_QuotedField_CorrectField(string value, string expected) { - var parser = new CharParser(new CsvProfile(new CsvDialectDescriptor() { QuoteChar='\'', Delimiter = ';', LineTerminator = "\r\n" })); + var parser = new CharParser(new CsvProfile(new DialectDescriptor() { QuoteChar = '\'', Delimiter = ';', LineTerminator = "\r\n" })); var result = string.Empty; foreach (var c in value) if (parser.Parse(c) == ParserState.Field) @@ -141,7 +152,7 @@ public void Parse_QuotedField_CorrectField(string value, string expected) public void Parse_DoubleQuotedFieldWhenDenied_Error(string value) { var parser = new CharParser(new CsvProfile( - new CsvDialectDescriptor() { QuoteChar = '\'', EscapeChar = '\\', DoubleQuote = false, Delimiter = ';', LineTerminator = "\r\n" })); + new DialectDescriptor() { QuoteChar = '\'', EscapeChar = '\\', DoubleQuote = false, Delimiter = ';', LineTerminator = "\r\n" })); var result = string.Empty; foreach (var c in value) if (parser.Parse(c) == ParserState.Error) @@ -155,7 +166,7 @@ public void Parse_DoubleQuotedFieldWhenDenied_Error(string value) public void Parse_DoubleQuotedFieldWhenAllowed_EscapedSet(string value) { var parser = new CharParser(new CsvProfile( - new CsvDialectDescriptor() { QuoteChar = '`', EscapeChar = '%', DoubleQuote = true, Delimiter = ';', LineTerminator = "\r\n" })); + new DialectDescriptor() { QuoteChar = '`', EscapeChar = '%', DoubleQuote = true, Delimiter = ';', LineTerminator = "\r\n" })); foreach (var c in value) if (parser.Parse(c) == ParserState.Field) { @@ -171,7 +182,7 @@ public void Parse_DoubleQuotedFieldWhenAllowed_EscapedSet(string value) public void Parse_EscapeQuoteInQuotedField_EscapedSet(string value) { var parser = new CharParser(new CsvProfile( - new CsvDialectDescriptor() { QuoteChar = '`', EscapeChar = '%', DoubleQuote = false, Delimiter = ';', 
LineTerminator = "\r\n" })); + new DialectDescriptor() { QuoteChar = '`', EscapeChar = '%', DoubleQuote = false, Delimiter = ';', LineTerminator = "\r\n" })); foreach (var c in value) if (parser.Parse(c) == ParserState.Field) { @@ -187,7 +198,7 @@ public void Parse_EscapeQuoteInQuotedField_EscapedSet(string value) public void Parse_EscapeDelimiterInUnquotedField_EscapedSet(string value) { var parser = new CharParser(new CsvProfile( - new CsvDialectDescriptor() { QuoteChar = '`', EscapeChar = '%', DoubleQuote = false, Delimiter = ';', LineTerminator = "\r\n" })); + new DialectDescriptor() { QuoteChar = '`', EscapeChar = '%', DoubleQuote = false, Delimiter = ';', LineTerminator = "\r\n" })); foreach (var c in value) if (parser.Parse(c) == ParserState.Field) { @@ -203,7 +214,7 @@ public void Parse_EscapeDelimiterInUnquotedField_EscapedSet(string value) public void Parse_SkipInitialSpace_SpaceSkip(string value, int start) { var parser = new CharParser(new CsvProfile( - new CsvDialectDescriptor() { SkipInitialSpace = true, QuoteChar = '`', EscapeChar = '%', DoubleQuote = false, Delimiter = ';', LineTerminator = "\r\n" })); + new DialectDescriptor() { SkipInitialSpace = true, QuoteChar = '`', EscapeChar = '%', DoubleQuote = false, Delimiter = ';', LineTerminator = "\r\n" })); foreach (var c in value) parser.Parse(c); Assert.That(parser.FieldStart, Is.EqualTo(start)); @@ -216,7 +227,7 @@ public void Parse_SkipInitialSpace_SpaceSkip(string value, int start) public void Parse_SkipInitialSpaceBeforeQuotedField_SpaceSkip(string value, int start) { var parser = new CharParser(new CsvProfile( - new CsvDialectDescriptor() { SkipInitialSpace = true, QuoteChar = '`', EscapeChar = '%', DoubleQuote = false, Delimiter = ';', LineTerminator = "\r\n" })); + new DialectDescriptor() { SkipInitialSpace = true, QuoteChar = '`', EscapeChar = '%', DoubleQuote = false, Delimiter = ';', LineTerminator = "\r\n" })); foreach (var c in value) parser.Parse(c); Assert.That(parser.FieldStart, 
Is.EqualTo(start)); @@ -229,10 +240,42 @@ public void Parse_SkipInitialSpaceBeforeQuotedField_SpaceSkip(string value, int public void Parse_SkipInitialSpaceWithinQuotedField_SpaceNotSkip(string value, int start) { var parser = new CharParser(new CsvProfile( - new CsvDialectDescriptor() { SkipInitialSpace = true, QuoteChar = '`', EscapeChar = '%', DoubleQuote = false, Delimiter = ';', LineTerminator = "\r\n" })); + new DialectDescriptor() { SkipInitialSpace = true, QuoteChar = '`', EscapeChar = '%', DoubleQuote = false, Delimiter = ';', LineTerminator = "\r\n" })); foreach (var c in value) parser.Parse(c); Assert.That(parser.FieldStart, Is.EqualTo(start)); - Assert.That(parser.FieldLength, Is.EqualTo(value.Length-7)); + Assert.That(parser.FieldLength, Is.EqualTo(value.Length - 7)); + } + + [TestCase("foo\r\nbar\r\n")] + [TestCase("Comment\r\nfoo\r\nbar\r\n", 1)] + [TestCase("Comment 1\r\nComment 2\r\nfoo\r\nbar\r\n", 1, 2)] + [TestCase("Comment 1\r\nComment 2\r\nfoo\r\nbar\r\nComment 3", 1, 2, 5)] + [TestCase("Comment 1\r\n\r\nfooComment 2\r\nbar\r\nComment 3", 1, 3, 5)] + public void Parse_CommentRows_CommentsSkipped(string value, params int[] commentRows) + { + var parser = new CharParser(new CsvProfile( + new DialectDescriptor() { Header = false, CommentRows = commentRows, LineTerminator = "\r\n" })); + var recordCount = 0; + foreach (var c in value) + if (parser.Parse(c) == ParserState.Record) + recordCount++; + Assert.That(recordCount, Is.EqualTo(2)); + } + + [TestCase("foo\r\nbar\r\n")] + [TestCase("Comment\r\nfoo\r\nbar\r\n#Comment", 1)] + [TestCase("Comment 1\r\nComment 2\r\nfoo\r\n#Comment\r\nbar\r\n#Comment", 1, 2)] + [TestCase("Comment 1\r\nComment 2\r\nfoo\r\n\r\n#Commentbar\r\nComment 3", 1, 2, 6)] + [TestCase("Comment 1\r\n\r\nfooComment 2\r\nbar\r\n#Comment\r\nComment 3", 1, 3, 6)] + public void Parse_CommentRowsAndComments_CommentsSkipped(string value, params int[] commentRows) + { + var parser = new CharParser(new CsvProfile( + new 
DialectDescriptor() { Header = false, CommentChar = '#', CommentRows = commentRows, LineTerminator = "\r\n" })); + var recordCount = 0; + foreach (var c in value) + if (parser.Parse(c) == ParserState.Record) + recordCount++; + Assert.That(recordCount, Is.EqualTo(2)); } } diff --git a/PocketCsvReader.Testing/Configuration/DialectDescriptorBuilderTest.cs b/PocketCsvReader.Testing/Configuration/DialectDescriptorBuilderTest.cs index b9243f6..93fda21 100644 --- a/PocketCsvReader.Testing/Configuration/DialectDescriptorBuilderTest.cs +++ b/PocketCsvReader.Testing/Configuration/DialectDescriptorBuilderTest.cs @@ -198,6 +198,19 @@ public void WithHeader_ShouldSetHeaderToTrue() .Build(); Assert.That(descriptor.Header, Is.True); + Assert.That(descriptor.HeaderRows, Is.Not.Null.And.Not.Empty); + } + + [Test] + public void SwitchHeaderValue_ShouldSetHeaderToTrue() + { + var descriptor = new DialectDescriptorBuilder() + .WithoutHeader() + .WithHeader() + .Build(); + + Assert.That(descriptor.Header, Is.True); + Assert.That(descriptor.HeaderRows, Is.Not.Null.And.Not.Empty); } [Test] @@ -208,64 +221,92 @@ public void WithoutHeader_ShouldSetHeaderToFalse() .Build(); Assert.That(descriptor.Header, Is.False); + Assert.That(descriptor.HeaderRows, Is.Empty); } [Test] - [TestCase("#")] - [TestCase("/")] - public void WithCommentChar_ShouldSetCommentChar(char commentChar) + [TestCase(" ")] + [TestCase("-")] + [TestCase(" - ")] + public void WithHeaderJoin_ShouldSetHeaderJoin(string join) { var descriptor = new DialectDescriptorBuilder() - .WithCommentChar(commentChar) + .WithHeaderJoin(join) .Build(); - Assert.That(descriptor.CommentChar, Is.EqualTo(commentChar)); + Assert.That(descriptor.HeaderJoin, Is.EqualTo(join)); } [Test] - [TestCase(CommentChar.Hash, '#')] - [TestCase(CommentChar.ForwardSlash, '/')] - [TestCase(CommentChar.Dash, '-')] - [TestCase(CommentChar.Semicolon, ';')] - public void WithCommentChar_ShouldSetCommentChar(CommentChar commentChar, char value) + [TestCase(1)] + 
[TestCase(1, 2, 3)] + public void WithHeaderRows_ShouldSetHeaderRows(params int[] rows) { var descriptor = new DialectDescriptorBuilder() - .WithCommentChar(commentChar) + .WithHeaderRows(rows) .Build(); - Assert.That(descriptor.CommentChar, Is.EqualTo(value)); + + Assert.That(descriptor.HeaderRows, Is.EqualTo(rows)); } + [Test] + public void WithHeaderRowsEmpty_ShouldSetHeaderRowsAndHeader() + { + var descriptor = new DialectDescriptorBuilder() + .WithHeaderRows([]) + .Build(); + + Assert.That(descriptor.HeaderRows, Is.Empty); + Assert.That(descriptor.Header, Is.False); + } [Test] - [TestCase(true)] - [TestCase(false)] - public void WithCaseSensitiveHeader_ShouldSetCaseSensitiveHeaderToValue(bool value) + public void WithoutHeaderRows_ShouldSetHeaderRowsAndHeader() { var descriptor = new DialectDescriptorBuilder() - .WithCaseSensitiveHeader(value) + .WithoutHeaderRows() .Build(); - Assert.That(descriptor.CaseSensitiveHeader, Is.EqualTo(value)); + Assert.That(descriptor.HeaderRows, Is.Empty); + Assert.That(descriptor.Header, Is.False); } [Test] - public void WithCaseSensitiveHeader_ShouldSetCaseSensitiveHeaderToTrue() + [TestCase("#")] + [TestCase("/")] + public void WithCommentChar_ShouldSetCommentChar(char commentChar) { var descriptor = new DialectDescriptorBuilder() - .WithCaseSensitiveHeader() + .WithCommentChar(commentChar) .Build(); - Assert.That(descriptor.CaseSensitiveHeader, Is.True); + Assert.That(descriptor.CommentChar, Is.EqualTo(commentChar)); + } + + [Test] + [TestCase(CommentChar.Hash, '#')] + [TestCase(CommentChar.ForwardSlash, '/')] + [TestCase(CommentChar.Dash, '-')] + [TestCase(CommentChar.Semicolon, ';')] + public void WithCommentChar_ShouldSetCommentChar(CommentChar commentChar, char value) + { + var descriptor = new DialectDescriptorBuilder() + .WithCommentChar(commentChar) + .Build(); + Assert.That(descriptor.CommentChar, Is.EqualTo(value)); } [Test] - public void WithCaseSensitiveHeader_WithoutParameter_ShouldSetCaseSensitiveHeaderToFalse() 
+ [TestCase()] + [TestCase(1)] + [TestCase(1, 2, 3)] + public void WithCommentRows_ShouldSetCommentChar(params int[] rows) { var descriptor = new DialectDescriptorBuilder() - .WithoutCaseSensitiveHeader() + .WithCommentRows(rows) .Build(); - Assert.That(descriptor.CaseSensitiveHeader, Is.False); + Assert.That(descriptor.CommentRows, Is.EqualTo(rows)); } [Test] diff --git a/PocketCsvReader.Testing/CsvDataReaderTest.cs b/PocketCsvReader.Testing/CsvDataReaderTest.cs index eb1742f..9f2480b 100644 --- a/PocketCsvReader.Testing/CsvDataReaderTest.cs +++ b/PocketCsvReader.Testing/CsvDataReaderTest.cs @@ -58,7 +58,7 @@ public void GetString_RecordWithTwoFields_CorrectParsing(string record, string f var buffer = new MemoryStream(Encoding.UTF8.GetBytes(record)); var profile = new CsvProfile( - new CsvDialectDescriptor() { Delimiter = ';', QuoteChar = '\'', DoubleQuote = true, Header = false }); + new DialectDescriptor() { Delimiter = ';', QuoteChar = '\'', DoubleQuote = true, Header = false }); using var dataReader = new CsvDataReader(buffer, profile); dataReader.Read(); Assert.That(dataReader.GetString(0), Is.EqualTo(firstToken)); @@ -71,7 +71,7 @@ public void GetInt32_RecordWithTwoFields_CorrectParsing() var buffer = new MemoryStream(Encoding.UTF8.GetBytes("foo;17")); var profile = new CsvProfile( - new CsvDialectDescriptor() { Delimiter = ';', QuoteChar = '\'', DoubleQuote = true, Header = false }); + new DialectDescriptor() { Delimiter = ';', QuoteChar = '\'', DoubleQuote = true, Header = false }); using var dataReader = new CsvDataReader(buffer, profile); dataReader.Read(); Assert.That(dataReader.GetString(0), Is.EqualTo("foo")); @@ -84,7 +84,7 @@ public void GetDecimal_RecordWithTwoFields_CorrectParsing() var buffer = new MemoryStream(Encoding.UTF8.GetBytes("foo;17.02542")); var profile = new CsvProfile( - new CsvDialectDescriptor() { Delimiter = ';', QuoteChar = '\'', DoubleQuote = true, Header = false }); + new DialectDescriptor() { Delimiter = ';', QuoteChar = '\'', 
DoubleQuote = true, Header = false }); using var dataReader = new CsvDataReader(buffer, profile); dataReader.Read(); Assert.That(dataReader.GetString(0), Is.EqualTo("foo")); @@ -97,7 +97,7 @@ public void GetDateTime_RecordWithTwoFields_CorrectParsing() var buffer = new MemoryStream(Encoding.UTF8.GetBytes("foo;2024-12-06T12:45:16")); var profile = new CsvProfile( - new CsvDialectDescriptor() { Delimiter = ';', QuoteChar = '\'', DoubleQuote = true, Header = false }); + new DialectDescriptor() { Delimiter = ';', QuoteChar = '\'', DoubleQuote = true, Header = false }); using var dataReader = new CsvDataReader(buffer, profile); dataReader.Read(); Assert.That(dataReader.GetString(0), Is.EqualTo("foo")); @@ -107,7 +107,7 @@ public void GetDateTime_RecordWithTwoFields_CorrectParsing() [Test] [TestCase("'fo\\'o'", '\\')] [TestCase("'fo?'o'", '?')] - public void ReadNextRecord_SingleFieldWithTextEscaper_CorrectParsing(string record, char escapeTextQualifier) + public void Read_SingleFieldWithTextEscaper_CorrectParsing(string record, char escapeTextQualifier) { var buffer = new MemoryStream(Encoding.UTF8.GetBytes(record)); @@ -120,12 +120,12 @@ public void ReadNextRecord_SingleFieldWithTextEscaper_CorrectParsing(string reco [Test] [TestCase("'fo''o'")] - public void ReadNextRecord_SingleFieldWithDoubleQuote_CorrectParsing(string record) + public void Read_SingleFieldWithDoubleQuote_CorrectParsing(string record) { var buffer = new MemoryStream(Encoding.UTF8.GetBytes(record)); var profile = new CsvProfile( - new CsvDialectDescriptor() { Delimiter = ';', QuoteChar = '\'', DoubleQuote = true, Header = false } + new DialectDescriptor() { Delimiter = ';', QuoteChar = '\'', DoubleQuote = true, Header = false } ); using var dataReader = new CsvDataReader(buffer, profile); dataReader.Read(); @@ -133,6 +133,63 @@ public void ReadNextRecord_SingleFieldWithDoubleQuote_CorrectParsing(string reco Assert.That(dataReader.GetString(0), Is.EqualTo("fo'o")); } + + [Test] + 
[TestCase("field0;field1\r\nfoo;bar")] + public void Read_WithHeader_CorrectParsing(string record) + { + var buffer = new MemoryStream(Encoding.UTF8.GetBytes(record)); + + var profile = new CsvProfile( + new DialectDescriptor() { Delimiter = ';', Header = true } + ); + using var dataReader = new CsvDataReader(buffer, profile); + dataReader.Read(); + Assert.That(dataReader.FieldCount, Is.EqualTo(2)); + Assert.That(dataReader.GetName(0), Is.EqualTo("field0")); + Assert.That(dataReader.GetName(1), Is.EqualTo("field1")); + Assert.That(dataReader.GetString(0), Is.EqualTo("foo")); + Assert.That(dataReader.GetString(1), Is.EqualTo("bar")); + } + + [Test] + [TestCase("field;field\r\n0;1\r\nfoo;bar")] + [TestCase("field\r\n0;1\r\nfoo;bar")] + public void Read_WithHeaders_CorrectParsing(string record) + { + var buffer = new MemoryStream(Encoding.UTF8.GetBytes(record)); + + var profile = new CsvProfile( + new DialectDescriptor() { Delimiter = ';', HeaderRows = [1,2], HeaderJoin="." } + ); + using var dataReader = new CsvDataReader(buffer, profile); + dataReader.Read(); + Assert.That(dataReader.FieldCount, Is.EqualTo(2)); + Assert.That(dataReader.GetName(0), Is.EqualTo("field.0")); + Assert.That(dataReader.GetName(1), Is.EqualTo("field.1")); + Assert.That(dataReader.GetString(0), Is.EqualTo("foo")); + Assert.That(dataReader.GetString(1), Is.EqualTo("bar")); + } + + [Test] + [TestCase("field;field\r\n0;1\r\nfoo;bar")] + [TestCase("field\r\n0;1\r\nfoo;bar")] + public void Read_WithHeadersSkippingSomeRows_CorrectParsing(string record) + { + var buffer = new MemoryStream(Encoding.UTF8.GetBytes(record)); + + var profile = new CsvProfile( + new DialectDescriptor() { Delimiter = ';', HeaderRows = [2], HeaderJoin = "." 
} + ); + using var dataReader = new CsvDataReader(buffer, profile); + dataReader.Read(); + Assert.That(dataReader.FieldCount, Is.EqualTo(2)); + Assert.That(dataReader.GetName(0), Is.EqualTo("0")); + Assert.That(dataReader.GetName(1), Is.EqualTo("1")); + Assert.That(dataReader.GetString(0), Is.EqualTo("foo")); + Assert.That(dataReader.GetString(1), Is.EqualTo("bar")); + } + [Test] [TestCase("Ansi")] [TestCase("Utf16-BE")] diff --git a/PocketCsvReader.Testing/CsvDataTableTest.cs b/PocketCsvReader.Testing/CsvDataTableTest.cs index 62a1c00..fae24b4 100644 --- a/PocketCsvReader.Testing/CsvDataTableTest.cs +++ b/PocketCsvReader.Testing/CsvDataTableTest.cs @@ -188,7 +188,7 @@ public void Read_MissingValue_MatchWithNullValue() public void Read_Comment_CommentedLinesSkipped(string content) { using var stream = new MemoryStream(Encoding.UTF8.GetBytes(content)); - var profile = new CsvProfile(new CsvDialectDescriptor { Header = false, Delimiter = ';', CommentChar = '#', DoubleQuote = false }); + var profile = new CsvProfile(new DialectDescriptor { Header = false, Delimiter = ';', CommentChar = '#', DoubleQuote = false }); var reader = new CsvReader(profile); var dataTable = reader.ToDataTable(stream); Assert.That(dataTable.Rows.Count, Is.EqualTo(2)); diff --git a/PocketCsvReader.Testing/RecordParserTest.cs b/PocketCsvReader.Testing/RecordParserTest.cs index cb32103..1a82bbe 100644 --- a/PocketCsvReader.Testing/RecordParserTest.cs +++ b/PocketCsvReader.Testing/RecordParserTest.cs @@ -86,7 +86,7 @@ public void ReadNextRecord_RecordWithTwoFields_CorrectParsing(string record, str var buffer = new MemoryStream(Encoding.UTF8.GetBytes(record)); var profile = new CsvProfile( - new CsvDialectDescriptor() { Delimiter = ';', QuoteChar = '\'', DoubleQuote = true }); + new DialectDescriptor() { Delimiter = ';', QuoteChar = '\'', DoubleQuote = true }); using var reader = new RecordParser(new StreamReader(buffer), profile, ArrayPool.Create(256, 5)); reader.ReadNextRecord(out var values); 
Assert.That(values.Slice(0).ToString(), Is.EqualTo(firstToken)); @@ -179,8 +179,8 @@ public void ReadNextRecord_SkipInitialWhitespace_CorrectResults(string record) { using var stream = new MemoryStream(Encoding.UTF8.GetBytes(record)); - var profile = new CsvProfile(';', '`'); - profile.Descriptor.SkipInitialSpace = true; + var dialect = new DialectDescriptor() with { Delimiter = ';', QuoteChar = '`', SkipInitialSpace = true }; + var profile = new CsvProfile(dialect); using var reader = new RecordParser(new StreamReader(stream), profile, ArrayPool.Create(256, 5)); using var streamReader = new StreamReader(stream); reader.ReadNextRecord(out var values); @@ -191,8 +191,9 @@ public void ReadNextRecord_SkipInitialWhitespace_CorrectResults(string record) [Test] [TestCase("foo;bar\r\n1;2\r\n3;4", true, "foo", "bar")] - [TestCase("foo;\r\n1;2\r\n3;4", true, "foo", "field_1")] - [TestCase("1;2\r\n3;4", false, "field_0", "field_1")] + [TestCase("foo;\r\n1;2\r\n3;4", true, "foo", "")] + [TestCase("foo\r\n1;2\r\n3;4", true, "foo")] + [TestCase("1;2\r\n3;4", false)] public void ReadHeaders_Record_CorrectResult(string text, bool hasHeader, params string[] headers) { var buffer = new MemoryStream(Encoding.UTF8.GetBytes(text)); @@ -200,9 +201,13 @@ public void ReadHeaders_Record_CorrectResult(string text, bool hasHeader, params var profile = new CsvProfile(';', '`', "\r\n", hasHeader); using var reader = new RecordParser(new StreamReader(buffer), profile, ArrayPool.Create(256, 5)); var values = reader.ReadHeaders(); - Assert.That(values, Has.Length.EqualTo(2)); - Assert.That(values[0], Is.EqualTo(headers[0])); - Assert.That(values[1], Is.EqualTo(headers[1])); + Assert.That(values, Has.Length.EqualTo(Convert.ToInt32(hasHeader))); + if (hasHeader) + { + Assert.That(values[0], Has.Length.EqualTo(headers.Length)); + for (int i = 0; i < headers.Length; i++) + Assert.That(values[0][i], Is.EqualTo(headers[i])); + } } [Test] @@ -338,7 +343,7 @@ public void 
ReadNextRecord_NullField_NullValue() { var buffer = new MemoryStream(Encoding.UTF8.GetBytes("a;(null)")); - var profile = new CsvProfile(new CsvDialectDescriptor() { Delimiter = ';', NullSequence = "(null)", Header = false }); + var profile = new CsvProfile(new DialectDescriptor() { Delimiter = ';', NullSequence = "(null)", Header = false }); using var reader = new CsvDataReader(buffer, profile); Assert.That(reader.Read(), Is.True); Assert.That(reader.IsDBNull(0), Is.False); diff --git a/PocketCsvReader.Testing/StringMapperTest.cs b/PocketCsvReader.Testing/StringMapperTest.cs index 92b988d..5ca51ff 100644 --- a/PocketCsvReader.Testing/StringMapperTest.cs +++ b/PocketCsvReader.Testing/StringMapperTest.cs @@ -44,7 +44,7 @@ public void ReadField_Null_CorrectString(string item, string result) item.AsSpan().CopyTo(buffer); buffer = buffer.Slice(0, item.Length); - var profile = new CsvProfile(new CsvDialectDescriptor { NullSequence = "(null)" }); + var profile = new CsvProfile(new DialectDescriptor { NullSequence = "(null)" }); var mapper = new StringMapper(profile); var value = mapper.Parse(buffer, false, false); Assert.That(value, Is.EqualTo(result)); @@ -58,7 +58,7 @@ public void ReadField_NullButQuoted_CorrectString(string item, string result) item.AsSpan().CopyTo(buffer); buffer = buffer.Slice(0, item.Length); - var profile = new CsvProfile(new CsvDialectDescriptor { NullSequence = "(null)" }); + var profile = new CsvProfile(new DialectDescriptor { NullSequence = "(null)" }); var mapper = new StringMapper(profile); var value = mapper.Parse(buffer.Slice(1, item.Length - 2), false, true); Assert.That(value, Is.EqualTo(result)); @@ -195,7 +195,7 @@ public void ReadField_NullSequence_NullValue(string field, string NullSequence) field.AsSpan().CopyTo(buffer); buffer = buffer.Slice(0, field.Length); - var profile = new CsvProfile(new CsvDialectDescriptor { NullSequence = NullSequence }); + var profile = new CsvProfile(new DialectDescriptor { NullSequence = NullSequence 
}); var mapper = new StringMapper(profile); var value = mapper.Parse(buffer, false, false); @@ -211,7 +211,7 @@ public void ReadField_WrongNullSequence_NotNullValue(string field, string NullSe field.AsSpan().CopyTo(buffer); buffer = buffer.Slice(0, field.Length); - var profile = new CsvProfile(new CsvDialectDescriptor { NullSequence = NullSequence }); + var profile = new CsvProfile(new DialectDescriptor { NullSequence = NullSequence }); var mapper = new StringMapper(profile); var value = mapper.Parse(buffer, false, false); diff --git a/PocketCsvReader/CharParser.cs b/PocketCsvReader/CharParser.cs index 931cc09..4c7be02 100644 --- a/PocketCsvReader/CharParser.cs +++ b/PocketCsvReader/CharParser.cs @@ -9,11 +9,13 @@ namespace PocketCsvReader; public class CharParser { + public int RowNumber { get; set; } = 0; public int Position { get; private set; } = -1; public int FieldStart { get; private set; } = 0; public int FieldLength { get; private set; } = 0; public bool IsQuotedField { get; private set; } = false; public bool IsEscapedField { get; private set; } = false; + public bool IsHeaderRow { get; private set; } = false; public CsvProfile Profile { get; } internal delegate ParserState InternalParse(char c); @@ -112,6 +114,10 @@ internal void SetQuotedField() [MethodImpl(MethodImplOptions.AggressiveInlining)] internal void SetEscapedField() => IsEscapedField = true; + internal void SetHeaderRow() + => IsHeaderRow = true; + internal void UnsetHeaderRow() + => IsHeaderRow = false; [MethodImpl(MethodImplOptions.AggressiveInlining)] internal void Switch(InternalParse parse) => Internal = parse; @@ -123,5 +129,6 @@ public enum ParserState Error, Field, Record, + Header, Eof, } diff --git a/PocketCsvReader/CharParsing/CharOfFieldLookupParser.cs b/PocketCsvReader/CharParsing/CharOfFieldLookupParser.cs index adbdd99..31e3340 100644 --- a/PocketCsvReader/CharParsing/CharOfFieldLookupParser.cs +++ b/PocketCsvReader/CharParsing/CharOfFieldLookupParser.cs @@ -11,7 +11,7 @@ 
internal class CharOfFieldLookupParser : IInternalCharParser protected readonly bool[] InterestingChars; private char FirstCharOfLineTerminator { get; set; } private char Delimiter { get; set; } - private char EscapeChar { get; set; } + private char? EscapeChar { get; set; } public CharOfFieldLookupParser(CharParser parser) { @@ -22,7 +22,8 @@ public CharOfFieldLookupParser(CharParser parser) InterestingChars = new bool[char.MaxValue + 1]; InterestingChars[Delimiter] = true; InterestingChars[FirstCharOfLineTerminator] = true; - InterestingChars[EscapeChar] = true; + if (EscapeChar.HasValue) + InterestingChars[EscapeChar.Value] = true; } public virtual ParserState Parse(char c) @@ -46,7 +47,7 @@ public virtual ParserState Parse(char c) : ParserState.Continue; } - if (c == EscapeChar) + if (EscapeChar.HasValue && c == EscapeChar) { Parser.Switch(Parser.AfterEscapeChar); return ParserState.Continue; diff --git a/PocketCsvReader/CharParsing/CharOfFieldParser.cs b/PocketCsvReader/CharParsing/CharOfFieldParser.cs index dab41e8..f81145d 100644 --- a/PocketCsvReader/CharParsing/CharOfFieldParser.cs +++ b/PocketCsvReader/CharParsing/CharOfFieldParser.cs @@ -10,7 +10,7 @@ internal class CharOfFieldParser : IInternalCharParser protected CharParser Parser { get; set; } private char FirstCharOfLineTerminator { get; set; } private char Delimiter { get; set; } - private char EscapeChar { get; set; } + private char? 
EscapeChar { get; set; } public CharOfFieldParser(CharParser parser) => (Parser, FirstCharOfLineTerminator, Delimiter, EscapeChar) @@ -35,7 +35,7 @@ public virtual ParserState Parse(char c) : ParserState.Continue; } - if (c == EscapeChar) + if (EscapeChar.HasValue && c == EscapeChar) { Parser.Switch(Parser.AfterEscapeChar); return ParserState.Continue; diff --git a/PocketCsvReader/CharParsing/FirstCharOfFieldLookupParser.cs b/PocketCsvReader/CharParsing/FirstCharOfFieldLookupParser.cs index 5c60772..9754f1f 100644 --- a/PocketCsvReader/CharParsing/FirstCharOfFieldLookupParser.cs +++ b/PocketCsvReader/CharParsing/FirstCharOfFieldLookupParser.cs @@ -11,10 +11,10 @@ internal class FirstCharOfFieldLookupParser : IInternalCharParser protected CharParser Parser { get; set; } protected readonly bool[] InterestingChars; private char FirstCharOfLineTerminator { get; set; } - private char QuoteChar { get; set; } + private char? QuoteChar { get; set; } private char Delimiter { get; set; } private bool IsSkipInitialSpace { get; set; } - private char EscapeChar { get; set; } + private char? 
EscapeChar { get; set; } public FirstCharOfFieldLookupParser(CharParser parser) { @@ -26,8 +26,10 @@ public FirstCharOfFieldLookupParser(CharParser parser) InterestingChars = new bool[char.MaxValue + 1]; InterestingChars[Delimiter] = true; InterestingChars[FirstCharOfLineTerminator] = true; - InterestingChars[EscapeChar] = true; - InterestingChars[QuoteChar] = true; + if (EscapeChar.HasValue) + InterestingChars[EscapeChar.Value] = true; + if (QuoteChar.HasValue) + InterestingChars[QuoteChar.Value] = true; InterestingChars[' '] = IsSkipInitialSpace; } @@ -42,7 +44,7 @@ public virtual ParserState Parse(char c) return ParserState.Continue; } - if (c == QuoteChar) + if (QuoteChar.HasValue && c == QuoteChar) { Parser.SetQuotedField(); Parser.Switch(Parser.FirstCharOfQuotedField); @@ -67,7 +69,7 @@ public virtual ParserState Parse(char c) } - if (c == EscapeChar) + if (EscapeChar.HasValue && c == EscapeChar) { Parser.Switch(Parser.AfterEscapeChar); return ParserState.Continue; diff --git a/PocketCsvReader/CharParsing/FirstCharOfFieldParser.cs b/PocketCsvReader/CharParsing/FirstCharOfFieldParser.cs index 0d8d66d..5b3fcfb 100644 --- a/PocketCsvReader/CharParsing/FirstCharOfFieldParser.cs +++ b/PocketCsvReader/CharParsing/FirstCharOfFieldParser.cs @@ -11,10 +11,10 @@ internal class FirstCharOfFieldParser : IInternalCharParser protected CharParser Parser { get; set; } private char FirstCharOfLineTerminator { get; set; } - private char QuoteChar { get; set; } + private char? QuoteChar { get; set; } private char Delimiter { get; set; } private bool IsSkipInitialSpace { get; set; } - private char EscapeChar { get; set; } + private char? 
EscapeChar { get; set; } public FirstCharOfFieldParser(CharParser parser) => (Parser, FirstCharOfLineTerminator, QuoteChar, Delimiter, IsSkipInitialSpace, EscapeChar) @@ -26,7 +26,7 @@ public virtual ParserState Parse(char c) { Parser.ResetFieldState(); - if (c == QuoteChar) + if (QuoteChar.HasValue && c == QuoteChar) { Parser.SetQuotedField(); Parser.Switch(Parser.FirstCharOfQuotedField); @@ -50,7 +50,7 @@ public virtual ParserState Parse(char c) return ParserState.Continue; } - if (c == EscapeChar) + if (EscapeChar.HasValue && c == EscapeChar) { Parser.Switch(Parser.AfterEscapeChar); return ParserState.Continue; diff --git a/PocketCsvReader/CharParsing/FirstCharOfQuotedFieldParser.cs b/PocketCsvReader/CharParsing/FirstCharOfQuotedFieldParser.cs index 6ebdd75..b017012 100644 --- a/PocketCsvReader/CharParsing/FirstCharOfQuotedFieldParser.cs +++ b/PocketCsvReader/CharParsing/FirstCharOfQuotedFieldParser.cs @@ -9,8 +9,8 @@ namespace PocketCsvReader.CharParsing; internal class FirstCharOfQuotedFieldParser : IInternalCharParser { protected CharParser Parser { get; set; } - private char QuoteChar { get; set; } - private char EscapeChar { get; set; } + private char? QuoteChar { get; set; } + private char? 
EscapeChar { get; set; } public FirstCharOfQuotedFieldParser(CharParser parser) => (Parser, QuoteChar, EscapeChar) @@ -20,13 +20,13 @@ public virtual ParserState Parse(char c) { Parser.SetFieldStart(); - if (c == QuoteChar) + if (QuoteChar.HasValue && c == QuoteChar) { Parser.Switch(Parser.AfterQuoteChar); return ParserState.Continue; } - if (c == EscapeChar) + if (EscapeChar.HasValue && c == EscapeChar) { Parser.Switch(Parser.AfterEscapeCharQuotedField); return ParserState.Continue; diff --git a/PocketCsvReader/CharParsing/FirstCharOfRecordParser.cs b/PocketCsvReader/CharParsing/FirstCharOfRecordParser.cs index a5c8ea9..ba48a50 100644 --- a/PocketCsvReader/CharParsing/FirstCharOfRecordParser.cs +++ b/PocketCsvReader/CharParsing/FirstCharOfRecordParser.cs @@ -7,19 +7,33 @@ namespace PocketCsvReader.CharParsing; internal class FirstCharOfRecordParser : FirstCharOfFieldParser { - private char CommentChar { get; set; } + private int[] CommentRows { get; set; } + private int[] HeaderRows { get; set; } + private char? CommentChar { get; set; } public FirstCharOfRecordParser(CharParser parser) - : base(parser) { CommentChar = Parser.Profile.Descriptor.CommentChar; } + : base(parser) + { + CommentChar = Parser.Profile.Descriptor.CommentChar; + CommentRows = Parser.Profile.Descriptor.CommentRows ?? []; + HeaderRows = Parser.Profile.Descriptor.Header ? 
Parser.Profile.Descriptor.HeaderRows : []; + } public override ParserState Parse(char c) { - if (c == CommentChar) + Parser.RowNumber++; + + if (HeaderRows.Contains(Parser.RowNumber)) + Parser.SetHeaderRow(); + else if (CommentRows.Contains(Parser.RowNumber) || (CommentChar.HasValue && c == CommentChar)) { + Parser.UnsetHeaderRow(); Parser.ZeroField(); Parser.Switch(Parser.Comment); return ParserState.Continue; } + else if (Parser.IsHeaderRow) + Parser.UnsetHeaderRow(); return base.Parse(c); } diff --git a/PocketCsvReader/CharParsing/LineTerminatorParser.cs b/PocketCsvReader/CharParsing/LineTerminatorParser.cs index cae9ef9..5297734 100644 --- a/PocketCsvReader/CharParsing/LineTerminatorParser.cs +++ b/PocketCsvReader/CharParsing/LineTerminatorParser.cs @@ -42,7 +42,7 @@ public ParserState Parse(char c) } public virtual ParserState NextState() - => ParserState.Record; + => Parser.IsHeaderRow ? ParserState.Header : ParserState.Record; public virtual ParserState SetBack() { diff --git a/PocketCsvReader/Configuration/DialectDescriptor.cs b/PocketCsvReader/Configuration/DialectDescriptor.cs new file mode 100644 index 0000000..1e9e3cd --- /dev/null +++ b/PocketCsvReader/Configuration/DialectDescriptor.cs @@ -0,0 +1,29 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace PocketCsvReader +{ + public record DialectDescriptor + ( + string Schema = "https://datapackage.org/profiles/1.0/tabledialect.json", + bool Header = true, + int[] HeaderRows = null!, + string HeaderJoin = " ", + int[]? CommentRows = null, + char? CommentChar = null, + char Delimiter = ',', + string LineTerminator = "\r\n", + char? QuoteChar = '"', + bool DoubleQuote = true, + char? EscapeChar = null, + string? NullSequence = null, + bool SkipInitialSpace = false, + string CsvDdfVersion = "2.0" + ) + { + public int[] HeaderRows { get; init; } = HeaderRows ?? 
[1]; + } +} diff --git a/PocketCsvReader/Configuration/DialectDescriptorBuilder.cs b/PocketCsvReader/Configuration/DialectDescriptorBuilder.cs index c99ce09..a149255 100644 --- a/PocketCsvReader/Configuration/DialectDescriptorBuilder.cs +++ b/PocketCsvReader/Configuration/DialectDescriptorBuilder.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using System.ComponentModel.Design; using System.Linq; using System.Text; using System.Threading.Tasks; @@ -7,14 +8,14 @@ namespace PocketCsvReader.Configuration; public class DialectDescriptorBuilder { - private CsvDialectDescriptor Descriptor { get; } = new(); + private DialectDescriptor Descriptor { get; set; } = new(); public DialectDescriptorBuilder WithDelimiter(char delimiter) - => (Descriptor.Delimiter = delimiter, Builder: this).Builder; + => (Descriptor = Descriptor with { Delimiter = delimiter }, Builder: this).Builder; public DialectDescriptorBuilder WithDelimiter(Delimiter delimiter) => WithDelimiter((char)delimiter); public DialectDescriptorBuilder WithLineTerminator(string lineTerminator) - => (Descriptor.LineTerminator = lineTerminator, Builder: this).Builder; + => (Descriptor = Descriptor with { LineTerminator = lineTerminator }, Builder: this).Builder; public DialectDescriptorBuilder WithLineTerminator(LineTerminator lineTerminator) { var terminator = lineTerminator switch @@ -26,42 +27,65 @@ public DialectDescriptorBuilder WithLineTerminator(LineTerminator lineTerminator }; return WithLineTerminator(terminator); } - public DialectDescriptorBuilder WithQuoteChar(char quoteChar) - => (Descriptor.QuoteChar = quoteChar, Builder: this).Builder; + => (Descriptor = Descriptor with { QuoteChar = quoteChar }, Builder: this).Builder; public DialectDescriptorBuilder WithQuoteChar(QuoteChar quoteChar) => WithQuoteChar((char)quoteChar); + public DialectDescriptorBuilder WithoutQuoteChar() + => (Descriptor = Descriptor with { QuoteChar = null }, Builder: this).Builder; public DialectDescriptorBuilder 
WithDoubleQuote(bool doubleQuote = true) - => (Descriptor.DoubleQuote = doubleQuote, Builder: this).Builder; + => (Descriptor = Descriptor with { DoubleQuote = doubleQuote }, Builder: this).Builder; public DialectDescriptorBuilder WithoutDoubleQuote() => WithDoubleQuote(false); public DialectDescriptorBuilder WithEscapeChar(char escapeChar) - => (Descriptor.EscapeChar = escapeChar, Builder: this).Builder; + => (Descriptor = Descriptor with { EscapeChar = escapeChar}, Builder: this).Builder; public DialectDescriptorBuilder WithEscapeChar(EscapeChar escapeChar) => WithEscapeChar((char)escapeChar); + public DialectDescriptorBuilder WithoutEscapeChar() + => (Descriptor = Descriptor with { EscapeChar = null }, Builder: this).Builder; public DialectDescriptorBuilder WithNullSequence(string? nullSequence) - => (Descriptor.NullSequence = nullSequence, Builder: this).Builder; + => (Descriptor = Descriptor with { NullSequence = nullSequence}, Builder: this).Builder; public DialectDescriptorBuilder WithoutNullSequence() => WithNullSequence(null); public DialectDescriptorBuilder WithSkipInitialSpace(bool skipInitialSpace = true) - => (Descriptor.SkipInitialSpace = skipInitialSpace, Builder: this).Builder; + => (Descriptor = Descriptor with { SkipInitialSpace = skipInitialSpace}, Builder: this).Builder; public DialectDescriptorBuilder WithoutSkipInitialSpace() => WithSkipInitialSpace(false); public DialectDescriptorBuilder WithHeader(bool header = true) - => (Descriptor.Header = header, Builder: this).Builder; + { + if (header != Descriptor.Header) + { + Descriptor = Descriptor with + { + Header = header, + HeaderRows = header ? 
[1] : [] + }; + } + return this; + } public DialectDescriptorBuilder WithoutHeader() => WithHeader(false); + public DialectDescriptorBuilder WithHeaderJoin(string join) + => (Descriptor = Descriptor with { HeaderJoin = join }, Builder: this).Builder; + public DialectDescriptorBuilder WithHeaderRows(int[] headerRows) + { + if (headerRows.Length == 0) + return WithoutHeader(); + return (Descriptor = Descriptor with { HeaderRows = headerRows }, Builder: this).Builder; + } + public DialectDescriptorBuilder WithoutHeaderRows() + => WithHeaderRows([]); public DialectDescriptorBuilder WithCommentChar(char commentChar) - => (Descriptor.CommentChar = commentChar, Builder: this).Builder; + => (Descriptor = Descriptor with { CommentChar = commentChar}, Builder: this).Builder; public DialectDescriptorBuilder WithCommentChar(CommentChar commentChar) => WithCommentChar((char)commentChar); - public DialectDescriptorBuilder WithCaseSensitiveHeader(bool caseSensitiveHeader = true) - => (Descriptor.CaseSensitiveHeader = caseSensitiveHeader, Builder: this).Builder; - public DialectDescriptorBuilder WithoutCaseSensitiveHeader() - => WithCaseSensitiveHeader(false); + public DialectDescriptorBuilder WithCommentRows(int[] commentRows) + => (Descriptor = Descriptor with { CommentRows = commentRows }, Builder: this).Builder; + public DialectDescriptorBuilder WithoutCommentRows() + => WithCommentRows([]); public DialectDescriptorBuilder WithCsvDdfVersion(string version) - => (Descriptor.CsvDdfVersion = version, Builder: this).Builder; + => (Descriptor = Descriptor with { CsvDdfVersion = version}, Builder: this).Builder; - public CsvDialectDescriptor Build() + public DialectDescriptor Build() => Descriptor; } diff --git a/PocketCsvReader/CsvArrayString.cs b/PocketCsvReader/CsvArrayString.cs index cd10507..1c4e821 100644 --- a/PocketCsvReader/CsvArrayString.cs +++ b/PocketCsvReader/CsvArrayString.cs @@ -75,12 +75,27 @@ public void Initialize() } } - private void RegisterHeader(string?[] 
names, string prefix) + private void RegisterHeader(string?[][] headers, string unamedPrefix) { + var maxField = headers.Select(x => x.Length).Max(); + var names = (string[])Array.CreateInstance(typeof(string), maxField); + + foreach (var header in headers) + { + var last = string.Empty; + for (int i = 0; i < maxField; i++) + { + if (i < header.Length && !string.IsNullOrEmpty(header[i])) + last = header[i]; + names[i] = string.IsNullOrEmpty(names[i]) + ? $"{last}" + : $"{names[i]}{Profile.Descriptor.HeaderJoin}{last}"; + } + } int unnamedFieldIndex = 0; Fields = (RecordParser!.Profile.Descriptor.Header - ? names.Select(value => { unnamedFieldIndex++; return string.IsNullOrWhiteSpace(value) ? $"{prefix}{unnamedFieldIndex}" : value; }) - : names.Select(_ => $"{prefix}{unnamedFieldIndex++}")).ToArray(); + ? names.Select(value => { unnamedFieldIndex++; return string.IsNullOrWhiteSpace(value) ? $"{unamedPrefix}{unnamedFieldIndex}" : value; }) + : names.Select(_ => $"{unamedPrefix}{unnamedFieldIndex++}")).ToArray(); } public void Dispose() diff --git a/PocketCsvReader/CsvDataReader.cs b/PocketCsvReader/CsvDataReader.cs index b1275e5..6762c5d 100644 --- a/PocketCsvReader/CsvDataReader.cs +++ b/PocketCsvReader/CsvDataReader.cs @@ -65,7 +65,7 @@ public bool Read() IsEof = RecordParser!.ReadNextRecord(out RecordSpan rawRecord); if (RowCount == 0 && !RecordParser!.Profile.Descriptor.Header) - RegisterHeader((string?[])Array.CreateInstance(typeof(string), rawRecord.FieldSpans.Length), "field_"); + RegisterHeader([(string?[])Array.CreateInstance(typeof(string), rawRecord.FieldSpans.Length)], "field_"); if (rawRecord.FieldSpans.Length == 0) { @@ -82,12 +82,27 @@ public bool Read() return true; } - private void RegisterHeader(string?[] names, string prefix) + private void RegisterHeader(string?[][] headers, string unamedPrefix) { + var maxField = headers.Select(x => x.Length).Max(); + var names = (string[])Array.CreateInstance(typeof(string), maxField); + + foreach (var header in 
headers) + { + var last = string.Empty; + for (int i = 0; i < maxField; i++) + { + if (i < header.Length && !string.IsNullOrEmpty(header[i])) + last = header[i]; + names[i] = string.IsNullOrEmpty(names[i]) + ? $"{last}" + : $"{names[i]}{Profile.Descriptor.HeaderJoin}{last}"; + } + } int unnamedFieldIndex = 0; Fields = (RecordParser!.Profile.Descriptor.Header - ? names.Select(value => { unnamedFieldIndex++; return string.IsNullOrWhiteSpace(value) ? $"{prefix}{unnamedFieldIndex}" : value; }) - : names.Select(_ => $"{prefix}{unnamedFieldIndex++}")).ToArray(); + ? names.Select(value => { unnamedFieldIndex++; return string.IsNullOrWhiteSpace(value) ? $"{unamedPrefix}{unnamedFieldIndex}" : value; }) + : names.Select(_ => $"{unamedPrefix}{unnamedFieldIndex++}")).ToArray(); } private void HandleUnexpectedFields(int expectedLength) diff --git a/PocketCsvReader/CsvDialectDescriptor.cs b/PocketCsvReader/CsvDialectDescriptor.cs deleted file mode 100644 index ea310a6..0000000 --- a/PocketCsvReader/CsvDialectDescriptor.cs +++ /dev/null @@ -1,23 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace PocketCsvReader -{ - public class CsvDialectDescriptor - { - public char Delimiter { get; internal set; } = ','; - public string LineTerminator { get; internal set; } = "\r\n"; - public char QuoteChar { get; internal set; } = '"'; - public bool DoubleQuote { get; internal set; } = false; //should be true? - public char EscapeChar { get; internal set; } - public string? 
NullSequence { get; internal set; } = null; - public bool SkipInitialSpace { get; internal set; } = false; - public bool Header { get; internal set; } = true; - public char CommentChar { get; internal set; } - public bool CaseSensitiveHeader { get; internal set; } = false; - public string CsvDdfVersion { get; internal set; } = "1.0"; - } -} diff --git a/PocketCsvReader/CsvProfile.cs b/PocketCsvReader/CsvProfile.cs index 072e986..06cf27f 100644 --- a/PocketCsvReader/CsvProfile.cs +++ b/PocketCsvReader/CsvProfile.cs @@ -5,7 +5,7 @@ namespace PocketCsvReader; public class CsvProfile { - public CsvDialectDescriptor Descriptor { get; private set; } + public DialectDescriptor Descriptor { get; private set; } public ParserOptimizationOptions ParserOptimizations { get; set; } public Dictionary Sequences { get; } = new(); @@ -60,11 +60,8 @@ public CsvProfile(char fieldSeparator, char textQualifier, char escapeTextQualif MissingCell = missingCell; } - public CsvProfile(CsvDialectDescriptor descriptor) + public CsvProfile(DialectDescriptor descriptor) { - if (descriptor.CaseSensitiveHeader) - throw new ArgumentException("PocketCsvReader doesn't support caseSensitiveHeader set to true in the CSV dialect descriptor."); - if (descriptor.NullSequence is not null) Sequences.Add(descriptor.NullSequence, null); diff --git a/PocketCsvReader/CsvReader.cs b/PocketCsvReader/CsvReader.cs index 5a01745..f70af66 100644 --- a/PocketCsvReader/CsvReader.cs +++ b/PocketCsvReader/CsvReader.cs @@ -20,7 +20,7 @@ public class CsvReader protected IEncodingDetector EncodingDetector { get; set; } = new EncodingDetector(); protected internal CsvProfile Profile { get; private set; } - public CsvDialectDescriptor Dialect { get => Profile.Descriptor; } + public DialectDescriptor Dialect { get => Profile.Descriptor; } protected int BufferSize { get; private set; } diff --git a/PocketCsvReader/FieldEscaper.cs b/PocketCsvReader/FieldEscaper.cs index 35dfc65..c3a0530 100644 --- 
a/PocketCsvReader/FieldEscaper.cs +++ b/PocketCsvReader/FieldEscaper.cs @@ -9,8 +9,8 @@ namespace PocketCsvReader; internal class FieldEscaper { protected ArrayPool? Pool { get; } - protected char QuoteChar { get; } - protected char EscapeChar { get; } + protected char? QuoteChar { get; } + protected char? EscapeChar { get; } protected char Delimiter { get; } protected bool DoubleQuote { get; } @@ -18,7 +18,7 @@ public FieldEscaper(CsvProfile Profile, ArrayPool? pool = null) : this(Profile.Descriptor.QuoteChar, Profile.Descriptor.DoubleQuote, Profile.Descriptor.EscapeChar, Profile.Descriptor.Delimiter, pool) { } - public FieldEscaper(char quoteChar, bool doubleQuote, char escapeChar, char delimiter, ArrayPool? pool = null) + public FieldEscaper(char? quoteChar, bool doubleQuote, char? escapeChar, char delimiter, ArrayPool? pool = null) => (QuoteChar, DoubleQuote, EscapeChar, Delimiter, Pool) = (quoteChar, doubleQuote, escapeChar, delimiter, pool); public ReadOnlySpan Escape(ReadOnlySpan value) diff --git a/PocketCsvReader/RecordParser.cs b/PocketCsvReader/RecordParser.cs index 7efa515..950c4d0 100644 --- a/PocketCsvReader/RecordParser.cs +++ b/PocketCsvReader/RecordParser.cs @@ -30,24 +30,6 @@ public RecordParser(StreamReader reader, CsvProfile profile, ArrayPool? po protected RecordParser(CsvProfile profile, IBufferReader buffer, ArrayPool? pool) => (Profile, Reader, CharParser) = (profile, buffer, new(profile)); - internal bool ReadNextArray(out string?[]? value) - { - var eof = ReadNextRecord(out RecordSpan rawRecord); - var arrayStringMapper = new SpanMapper((span, fieldSpans) => - { - var values = new string?[fieldSpans.Count()]; - var index = 0; - foreach (var fieldSpan in fieldSpans) - { - var value = span.Slice(fieldSpan.Start, fieldSpan.Length); - values[index++] = value.Length == 0 ? null : value.ToString(); - } - return values; - }); - value = rawRecord.FieldSpans.Length == 0 ? 
null : arrayStringMapper(rawRecord.Span, rawRecord.FieldSpans); - return eof; - } - public virtual bool ReadNextRecord(out RecordSpan record) { var index = 0; @@ -69,16 +51,16 @@ public virtual bool ReadNextRecord(out RecordSpan record) { char c = span[index]; var state = CharParser.Parse(c); - if (state == ParserState.Field || state == ParserState.Record) + if (state == ParserState.Field || state == ParserState.Record || state == ParserState.Header) { fieldList.Add(new FieldSpan(CharParser.FieldStart, CharParser.FieldLength, CharParser.IsEscapedField, CharParser.IsQuotedField)); - if (state == ParserState.Record) + if (state == ParserState.Record || state == ParserState.Header) { CharParser.Reset(); Buffer = Buffer.Slice(index + 1); FieldsCount ??= fieldList.Count; - record = new RecordSpan( + record = new RecordSpan( Profile , longSpan.Length > 0 ? (ReadOnlySpan)(longSpan.Concat(span)) : span , [.. fieldList]); @@ -113,6 +95,7 @@ public virtual bool ReadNextRecord(out RecordSpan record) switch (CharParser.ParseEof()) { + case ParserState.Header: case ParserState.Record: fieldList.Add(new FieldSpan(CharParser.FieldStart, CharParser.FieldLength, CharParser.IsEscapedField, CharParser.IsQuotedField)); record = new RecordSpan( @@ -130,8 +113,11 @@ public virtual bool ReadNextRecord(out RecordSpan record) } } - public virtual string[] ReadHeaders() + public virtual string[][] ReadHeaders() { + if (!Profile.Descriptor.Header) + return []; + var headerMapper = new SpanMapper((span, fieldSpans) => { var headers = new string[fieldSpans.Count()]; @@ -141,16 +127,19 @@ public virtual string[] ReadHeaders() return headers; }); - var unnamedFieldIndex = -1; - ReadNextRecord(out RecordSpan rawRecord); - var fields = rawRecord.FieldSpans.Length == 0 ? [] : headerMapper(rawRecord.Span, rawRecord.FieldSpans); - return fields.Select(value => - { - unnamedFieldIndex++; - return string.IsNullOrWhiteSpace(value) || !Profile.Descriptor.Header - ? 
$"field_{unnamedFieldIndex}" - : value!; - }).ToArray(); + var headerList = new List(); + var rowCount = 1; + while (rowCount <= Profile.Descriptor.HeaderRows.Max()) + { + ReadNextRecord(out RecordSpan rawRecord); + if (Profile.Descriptor.HeaderRows.Contains(rowCount)) + { + var fields = rawRecord.FieldSpans.Length == 0 ? [] : headerMapper(rawRecord.Span, rawRecord.FieldSpans); + headerList.Add(fields); + } + rowCount++; + } + return [.. headerList]; } public int? CountRecords() @@ -227,7 +216,7 @@ public virtual string GetFirstRecord() index -= Profile.Descriptor.LineTerminator.Length - 1; break; } - + index++; } CharParser.Reset(); diff --git a/docs/_docs/csv-dialect-descriptor.md b/docs/_docs/csv-dialect-descriptor.md index ee40001..f08d9cf 100644 --- a/docs/_docs/csv-dialect-descriptor.md +++ b/docs/_docs/csv-dialect-descriptor.md @@ -67,19 +67,44 @@ The description of PocketCsvReader is aligned with the [CSV Dialect Specificatio ## `Header` -- **Description:** Indicates whether the first row of the CSV contains column headers. +- **Description:** Specifies whether the first row(s) of the CSV contain column headers. - **Default Value:** `true`. - **Tuning:** - Set to `false` if your CSV does not include headers. - Useful for datasets where all rows are data. +## `HeaderRows` + +- **Description:** Specifies the row indexes that contain the headers. +- **Default Value:** `[1]`. +- **Tuning:** + - Set to `[1,2]` if your CSV defines headers across the first two rows. + - Set to `[2,3]` if your CSV defines headers across the second and third rows. The first row is ignored. + - Useful for datasets where multiple rows combine to define the headers. + +## `HeaderJoin` + +- **Description:** Specifies the separator used to combine fields when headers span multiple rows. +- **Default Value:** ` ` (a single space; header fields from successive rows are joined with a space as separator). +- **Tuning:** + - Set to `.` to produce fields like `fruit.id` and `fruit.name`. 
+ - Useful for datasets with multi-line headers that need to be merged into a single row of header fields. + ## `CommentChar` -- **Description:** The character used to denote comments in the CSV file. +- **Description:** The character used to denote comments in the CSV file. Must be the first character of the row. - **Default Value:** `null` (null, indicating no comments). - **Tuning:** - Specify a comment character (e.g., `'#'` or `';'`) to skip lines starting with that character. +## `CommentRows` + +- **Description:** Specifies row indexes that are treated as comments, regardless of whether `commentChar` is set. +- **Default Value:** `[]` (no specific rows are treated as comments). +- **Tuning:** + - Specify indexes like `[1,3,4]` to treat the first, third, and fourth rows as comments. + - Useful for skipping predefined rows that do not contain data. + ## `CaseSensitiveHeader` - **Description:** Indicates whether header names should be treated as case-sensitive. diff --git a/docs/_docs/fluent-api-profile-configuration.md b/docs/_docs/fluent-api-profile-configuration.md index 6ce11d8..d4c6b39 100644 --- a/docs/_docs/fluent-api-profile-configuration.md +++ b/docs/_docs/fluent-api-profile-configuration.md @@ -81,16 +81,20 @@ The `DialectDescriptorBuilder` class allows you to configure a CSV dialect descr | `WithDelimiter(char delimiter)` | Sets the delimiter character used in the CSV. | | `WithLineTerminator(string line)` | Sets the line terminator (e.g., `"\r\n"` for Windows or `"\n"` for Unix). | | `WithQuoteChar(char quoteChar)` | Sets the character used for quoting fields. | +| `WithoutQuoteChar()` | Unsets the quote character used in the CSV. | | `WithDoubleQuote(bool doubleQuote)` | Enables or disables double quoting for fields containing special characters. | | `WithoutDoubleQuote()` | Disables double quoting (same as calling `WithDoubleQuote(false)`). | | `WithEscapeChar(char escapeChar)` | Sets the escape character used in the CSV. 
| +| `WithoutEscapeChar()` | Unsets the escape character used in the CSV. | | `WithNullSequence(string? nullSeq)` | Defines a sequence used to represent `null` values in the CSV. | | `WithoutNullSequence()` | Removes the null sequence (same as calling `WithNullSequence(null)`). | | `WithSkipInitialSpace(bool skip)` | Enables or disables skipping spaces after the delimiter. | | `WithoutSkipInitialSpace()` | Disables skipping spaces (same as calling `WithSkipInitialSpace(false)`). | | `WithHeader(bool header)` | Enables or disables the inclusion of a header row. | | `WithoutHeader()` | Disables headers (same as calling `WithHeader(false)`). | -| `WithCommentChar(char commentChar)` | Sets the character used to denote comments in the CSV. | +| `WithHeaderRows(int[] rows)` | Enables headers and sets the indexes of header rows. | +| `WithHeaderJoin(string join)` | Sets the string used to join fields from different rows to create the header. | +| `WithCommentChar(char commentChar)` | Sets the character used to denote comments in the CSV. | +| `WithCommentRows(int[] rows)` | Sets the indexes of comment rows. | | `WithCaseSensitiveHeader(bool cs)` | Enables or disables case sensitivity for header fields. | -| `WithoutCaseSensitiveHeader()` | Sets case sensitivity for headers to `false` (same as `WithCaseSensitiveHeader(false)`). | | `WithCsvDdfVersion(string version)` | Sets the version of the CSV DDF (Data Definition Format). |