From be9fcf8c67e005849faab3759f246cdbf815d74b Mon Sep 17 00:00:00 2001 From: James Barnett Date: Wed, 31 Jul 2024 14:23:17 -0400 Subject: [PATCH] Revert "BED-4463: Implement BOM encoding package and refactor WriteAndValidateJSON for improved Unicode handling (#678)" (#757) This reverts commit b5bf10dbbb7dc9c262d395d31aa3ae8ae353af51. --- cmd/api/src/api/v2/file_uploads.go | 17 +- .../api/v2/file_uploads_integration_test.go | 15 +- .../src/services/fileupload/file_upload.go | 34 +- .../services/fileupload/file_upload_test.go | 124 ++----- cmd/api/src/services/fileupload/validation.go | 5 +- go.work | 1 - packages/go/bomenc/encodings.go | 144 -------- packages/go/bomenc/encodings_test.go | 221 ----------- packages/go/bomenc/go.mod | 30 -- packages/go/bomenc/go.sum | 8 - packages/go/bomenc/normalize.go | 152 -------- packages/go/bomenc/normalize_test.go | 347 ------------------ packages/go/bomenc/utf16.go | 130 ------- packages/go/bomenc/utf16_test.go | 216 ----------- packages/go/bomenc/utf32.go | 133 ------- packages/go/bomenc/utf32_test.go | 153 -------- 16 files changed, 72 insertions(+), 1658 deletions(-) delete mode 100644 packages/go/bomenc/encodings.go delete mode 100644 packages/go/bomenc/encodings_test.go delete mode 100644 packages/go/bomenc/go.mod delete mode 100644 packages/go/bomenc/go.sum delete mode 100644 packages/go/bomenc/normalize.go delete mode 100644 packages/go/bomenc/normalize_test.go delete mode 100644 packages/go/bomenc/utf16.go delete mode 100644 packages/go/bomenc/utf16_test.go delete mode 100644 packages/go/bomenc/utf32.go delete mode 100644 packages/go/bomenc/utf32_test.go diff --git a/cmd/api/src/api/v2/file_uploads.go b/cmd/api/src/api/v2/file_uploads.go index 5854df04bd..be88ea9c5b 100644 --- a/cmd/api/src/api/v2/file_uploads.go +++ b/cmd/api/src/api/v2/file_uploads.go @@ -19,24 +19,21 @@ package v2 import ( "errors" "fmt" - "mime" - "net/http" - "slices" - "strconv" - "strings" - "github.com/gorilla/mux" - "github.com/specterops/bloodhound/bomenc" "github.com/specterops/bloodhound/headers" "github.com/specterops/bloodhound/log" "github.com/specterops/bloodhound/src/api" "github.com/specterops/bloodhound/src/auth" "github.com/specterops/bloodhound/src/ctx" "github.com/specterops/bloodhound/src/model" + ingestModel "github.com/specterops/bloodhound/src/model/ingest" "github.com/specterops/bloodhound/src/services/fileupload" "github.com/specterops/bloodhound/src/services/ingest" - - ingestModel "github.com/specterops/bloodhound/src/model/ingest" + "mime" + "net/http" + "slices" + "strconv" + "strings" ) const FileUploadJobIdPathParameterName = "file_upload_job_id" @@ -138,7 +135,7 @@ func (s Resources) ProcessFileUpload(response http.ResponseWriter, request *http api.WriteErrorResponse(request.Context(), api.BuildErrorResponse(http.StatusBadRequest, api.ErrorResponseDetailsIDMalformed, request), response) } else if fileUploadJob, err := fileupload.GetFileUploadJobByID(request.Context(), s.DB, int64(fileUploadJobID)); err != nil { api.HandleDatabaseError(request, response, err) - } else if fileName, fileType, err := fileupload.SaveIngestFile(s.Config.TempDirectory(), request); errors.Is(err, bomenc.ErrUnknownEncodingInvalidUTF8) { + } else if fileName, fileType, err := fileupload.SaveIngestFile(s.Config.TempDirectory(), request); errors.Is(err, fileupload.ErrInvalidJSON) { api.WriteErrorResponse(request.Context(), api.BuildErrorResponse(http.StatusBadRequest, fmt.Sprintf("Error saving ingest file: %v", err), request), response) } else if err != nil { api.WriteErrorResponse(request.Context(), api.BuildErrorResponse(http.StatusInternalServerError, fmt.Sprintf("Error saving ingest file: %v", err), request), response) diff --git a/cmd/api/src/api/v2/file_uploads_integration_test.go b/cmd/api/src/api/v2/file_uploads_integration_test.go index 292641a745..ecb85fcc1e 100644 --- a/cmd/api/src/api/v2/file_uploads_integration_test.go +++ b/cmd/api/src/api/v2/file_uploads_integration_test.go @@ -23,13 +23,12 @@ import ( "bytes" "compress/gzip" "fmt" + "github.com/specterops/bloodhound/mediatypes" + "github.com/specterops/bloodhound/src/services/fileupload" "io" "net/http" "testing" - "github.com/specterops/bloodhound/bomenc" - "github.com/specterops/bloodhound/mediatypes" - "github.com/specterops/bloodhound/headers" "github.com/specterops/bloodhound/src/api/v2/integration" "github.com/specterops/bloodhound/src/test/fixtures/fixtures" @@ -170,7 +169,7 @@ func Test_FileUploadWorkFlowVersion5(t *testing.T) { "v5/ingest/sessions.json", }) - // Assert that we created stuff we expected + //Assert that we created stuff we expected testCtx.AssertIngest(fixtures.IngestAssertions) } @@ -189,7 +188,7 @@ func Test_FileUploadWorkFlowVersion6(t *testing.T) { "v6/ingest/sessions.json", }) - // Assert that we created stuff we expected + //Assert that we created stuff we expected testCtx.AssertIngest(fixtures.IngestAssertions) testCtx.AssertIngest(fixtures.IngestAssertionsv6) testCtx.AssertIngest(fixtures.PropertyAssertions) @@ -240,7 +239,7 @@ func Test_CompressedFileUploadWorkFlowVersion5(t *testing.T) { "v5/ingest/sessions.json", }) - // Assert that we created stuff we expected + //Assert that we created stuff we expected testCtx.AssertIngest(fixtures.IngestAssertions) testCtx.AssertIngest(fixtures.PropertyAssertions) } @@ -260,7 +259,7 @@ func Test_CompressedFileUploadWorkFlowVersion6(t *testing.T) { "v6/ingest/sessions.json", }) - // Assert that we created stuff we expected + //Assert that we created stuff we expected testCtx.AssertIngest(fixtures.IngestAssertions) testCtx.AssertIngest(fixtures.IngestAssertionsv6) testCtx.AssertIngest(fixtures.PropertyAssertions) @@ -269,5 +268,5 @@ func Test_CompressedFileUploadWorkFlowVersion6(t *testing.T) { func Test_BadFileUploadError(t *testing.T) { testCtx := integration.NewFOSSContext(t) - testCtx.SendInvalidFileIngest("v6/ingest/jker.jpg", bomenc.ErrUnknownEncodingInvalidUTF8) + testCtx.SendInvalidFileIngest("v6/ingest/jker.jpg", fileupload.ErrInvalidJSON) } diff --git a/cmd/api/src/services/fileupload/file_upload.go b/cmd/api/src/services/fileupload/file_upload.go index b499702a4c..4cb7dff183 100644 --- a/cmd/api/src/services/fileupload/file_upload.go +++ b/cmd/api/src/services/fileupload/file_upload.go @@ -19,20 +19,17 @@ package fileupload import ( "bufio" - "bytes" "context" "errors" "fmt" - "io" - "net/http" - "os" - "time" - - "github.com/specterops/bloodhound/bomenc" "github.com/specterops/bloodhound/headers" "github.com/specterops/bloodhound/mediatypes" "github.com/specterops/bloodhound/src/model/ingest" "github.com/specterops/bloodhound/src/utils" + "io" + "net/http" + "os" + "time" "github.com/specterops/bloodhound/log" "github.com/specterops/bloodhound/src/model" @@ -40,6 +37,12 @@ import ( const jobActivityTimeout = time.Minute * 20 +const ( + UTF8BOM1 = 0xef + UTF8BOM2 = 0xbb + UTF8BMO3 = 0xbf +) + var ErrInvalidJSON = errors.New("file is not valid json") type FileUploadData interface { @@ -117,14 +120,17 @@ func WriteAndValidateZip(src io.Reader, dst io.Writer) error { func WriteAndValidateJSON(src io.Reader, dst io.Writer) error { tr := io.TeeReader(src, dst) - normalizedContent, err := bomenc.NormalizeToUTF8(bufio.NewReader(tr)) - if err != nil { + bufReader := bufio.NewReader(tr) + if b, err := bufReader.Peek(3); err != nil { return err + } else { + if b[0] == UTF8BOM1 && b[1] == UTF8BOM2 && b[2] == UTF8BMO3 { + if _, err := bufReader.Discard(3); err != nil { + return err + } + } } - _, err = ValidateMetaTag( - bytes.NewReader(normalizedContent.NormalizedContent()), - true, - ) + _, err := ValidateMetaTag(bufReader, true) return err } @@ -140,7 +146,7 @@ func SaveIngestFile(location string, request *http.Request) (string, model.FileT } else if utils.HeaderMatches(request.Header, headers.ContentType.String(), ingest.AllowedZipFileUploadTypes...) { return tempFile.Name(), model.FileTypeZip, WriteAndValidateFile(fileData, tempFile, WriteAndValidateZip) } else { - // We should never get here since this is checked a level above + //We should never get here since this is checked a level above return "", model.FileTypeJson, fmt.Errorf("invalid content type for ingest file") } } diff --git a/cmd/api/src/services/fileupload/file_upload_test.go b/cmd/api/src/services/fileupload/file_upload_test.go index 0b8bf3fe3c..7944e259bb 100644 --- a/cmd/api/src/services/fileupload/file_upload_test.go +++ b/cmd/api/src/services/fileupload/file_upload_test.go @@ -18,19 +18,50 @@ package fileupload import ( "bytes" - "errors" + "github.com/specterops/bloodhound/src/model/ingest" + "github.com/stretchr/testify/assert" "io" "os" "strings" "testing" - - "github.com/specterops/bloodhound/src/model/ingest" - "github.com/stretchr/testify/assert" ) +func TestWriteAndValidateJSON(t *testing.T) { + t.Run("trigger invalid json on bad json", func(t *testing.T) { + var ( + writer = bytes.Buffer{} + badJSON = strings.NewReader("{[]}") + ) + err := WriteAndValidateJSON(badJSON, &writer) + assert.ErrorIs(t, err, ErrInvalidJSON) + }) + + t.Run("succeed on good json", func(t *testing.T) { + var ( + writer = bytes.Buffer{} + goodJSON = strings.NewReader(`{"meta": {"methods": 0, "type": "sessions", "count": 0, "version": 5}, "data": []}`) + ) + err := WriteAndValidateJSON(goodJSON, &writer) + assert.Nil(t, err) + }) + + t.Run("succeed on utf-8 BOM json", func(t *testing.T) { + var ( + writer = bytes.Buffer{} + ) + + file, err := os.Open("../../test/fixtures/fixtures/utf8bomjson.json") + assert.Nil(t, err) + err = WriteAndValidateJSON(io.Reader(file), &writer) + assert.Nil(t, err) + }) +} + func TestWriteAndValidateZip(t *testing.T) { t.Run("valid zip file is ok", func(t *testing.T) { - writer := bytes.Buffer{} + var ( + writer = bytes.Buffer{} + ) file, err := os.Open("../../test/fixtures/fixtures/goodzip.zip") assert.Nil(t, err) @@ -49,86 +80,3 @@ func TestWriteAndValidateZip(t *testing.T) { assert.Equal(t, err, ingest.ErrInvalidZipFile) }) } - -func TestWriteAndValidateJSON(t *testing.T) { - tests := []struct { - name string - input []byte - expectedOutput []byte - expectedError error - }{ - { - name: "UTF-8 without BOM", - input: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"}]}`), - expectedOutput: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"}]}`), - expectedError: nil, - }, - { - name: "UTF-8 with BOM", - input: append([]byte{0xEF, 0xBB, 0xBF}, []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"}]}`)...), - expectedOutput: append([]byte{0xEF, 0xBB, 0xBF}, []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"}]}`)...), - expectedError: nil, - }, - { - name: "UTF-16BE with BOM", - input: []byte{0xFE, 0xFF, 0x00, 0x7B, 0x00, 0x22, 0x00, 0x6D, 0x00, 0x65, 0x00, 0x74, 0x00, 0x61, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x7B, 0x00, 0x22, 0x00, 0x74, 0x00, 0x79, 0x00, 0x70, 0x00, 0x65, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x22, 0x00, 0x64, 0x00, 0x6F, 0x00, 0x6D, 0x00, 0x61, 0x00, 0x69, 0x00, 0x6E, 0x00, 0x73, 0x00, 0x22, 0x00, 0x2C, 0x00, 0x20, 0x00, 0x22, 0x00, 0x76, 0x00, 0x65, 0x00, 0x72, 0x00, 0x73, 0x00, 0x69, 0x00, 0x6F, 0x00, 0x6E, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x34, 0x00, 0x2C, 0x00, 0x20, 0x00, 0x22, 0x00, 0x63, 0x00, 0x6F, 0x00, 0x75, 0x00, 0x6E, 0x00, 0x74, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x31, 0x00, 0x7D, 0x00, 0x2C, 0x00, 0x20, 0x00, 0x22, 0x00, 0x64, 0x00, 0x61, 0x00, 0x74, 0x00, 0x61, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x5B, 0x00, 0x7B, 0x00, 0x22, 0x00, 0x64, 0x00, 0x6F, 0x00, 0x6D, 0x00, 0x61, 0x00, 0x69, 0x00, 0x6E, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x22, 0x00, 0x65, 0x00, 0x78, 0x00, 0x61, 0x00, 0x6D, 0x00, 0x70, 0x00, 0x6C, 0x00, 0x65, 0x00, 0x2E, 0x00, 0x63, 0x00, 0x6F, 0x00, 0x6D, 0x00, 0x22, 0x00, 0x7D, 0x00, 0x5D, 0x00, 0x7D}, - expectedOutput: []byte{0xFE, 0xFF, 0x00, 0x7B, 0x00, 0x22, 0x00, 0x6D, 0x00, 0x65, 0x00, 0x74, 0x00, 0x61, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x7B, 0x00, 0x22, 0x00, 0x74, 0x00, 0x79, 0x00, 0x70, 0x00, 0x65, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x22, 0x00, 0x64, 0x00, 0x6F, 0x00, 0x6D, 0x00, 0x61, 0x00, 0x69, 0x00, 0x6E, 0x00, 0x73, 0x00, 0x22, 0x00, 0x2C, 0x00, 0x20, 0x00, 0x22, 0x00, 0x76, 0x00, 0x65, 0x00, 0x72, 0x00, 0x73, 0x00, 0x69, 0x00, 0x6F, 0x00, 0x6E, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x34, 0x00, 0x2C, 0x00, 0x20, 0x00, 0x22, 0x00, 0x63, 0x00, 0x6F, 0x00, 0x75, 0x00, 0x6E, 0x00, 0x74, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x31, 0x00, 0x7D, 0x00, 0x2C, 0x00, 0x20, 0x00, 0x22, 0x00, 0x64, 0x00, 0x61, 0x00, 0x74, 0x00, 0x61, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x5B, 0x00, 0x7B, 0x00, 0x22, 0x00, 0x64, 0x00, 0x6F, 0x00, 0x6D, 0x00, 0x61, 0x00, 0x69, 0x00, 0x6E, 0x00, 0x22, 0x00, 0x3A, 0x00, 0x20, 0x00, 0x22, 0x00, 0x65, 0x00, 0x78, 0x00, 0x61, 0x00, 0x6D, 0x00, 0x70, 0x00, 0x6C, 0x00, 0x65, 0x00, 0x2E, 0x00, 0x63, 0x00, 0x6F, 0x00, 0x6D, 0x00, 0x22, 0x00, 0x7D, 0x00, 0x5D, 0x00, 0x7D}, - expectedError: nil, - }, - { - name: "Missing meta tag", - input: []byte(`{"data": [{"domain": "example.com"}]}`), - expectedOutput: []byte(`{"data": [{"domain": "example.com"}]}`), - expectedError: ingest.ErrMetaTagNotFound, - }, - { - name: "Missing data tag", - input: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}}`), - expectedOutput: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}}`), - expectedError: ingest.ErrDataTagNotFound, - }, - // NOTE: this test discovers a bug where invalid JSON files are not being invalidated due to the current - // implemenation of ValidateMetaTag of decoding each token. - // { - // name: "Invalid JSON", - // input: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"`), - // expectedOutput: []byte(`{"meta": {"type": "domains", "version": 4, "count": 1}, "data": [{"domain": "example.com"`), - // expectedError: ErrInvalidJSON, - // }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - src := bytes.NewReader(tt.input) - dst := &bytes.Buffer{} - - err := WriteAndValidateJSON(src, dst) - if tt.expectedError != nil { - assert.Error(t, err) - assert.ErrorIs(t, err, tt.expectedError) - } else { - assert.NoError(t, err) - } - assert.Equal(t, tt.expectedOutput, dst.Bytes()) - }) - } -} - -func TestWriteAndValidateJSON_NormalizationError(t *testing.T) { - src := &ErrorReader{err: errors.New("read error")} - dst := &bytes.Buffer{} - - err := WriteAndValidateJSON(src, dst) - - assert.Error(t, err) - assert.Equal(t, "read error", err.Error()) -} - -// ErrorReader is a mock reader that always returns an error -type ErrorReader struct { - err error -} - -func (er *ErrorReader) Read(p []byte) (n int, err error) { - return 0, er.err -} diff --git a/cmd/api/src/services/fileupload/validation.go b/cmd/api/src/services/fileupload/validation.go index d035c64517..3055f5a075 100644 --- a/cmd/api/src/services/fileupload/validation.go +++ b/cmd/api/src/services/fileupload/validation.go @@ -19,10 +19,9 @@ package fileupload import ( "encoding/json" "errors" - "io" - "github.com/specterops/bloodhound/log" "github.com/specterops/bloodhound/src/model/ingest" + "io" ) var ZipMagicBytes = []byte{0x50, 0x4b, 0x03, 0x04} @@ -53,7 +52,7 @@ func ValidateMetaTag(reader io.Reader, readToEnd bool) (ingest.Metadata, error) return ingest.Metadata{}, ErrInvalidJSON } } else { - // Validate that our data tag is actually opening correctly + //Validate that our data tag is actually opening correctly if dataTagFound && !dataTagValidated { if typed, ok := token.(json.Delim); ok && typed == ingest.DelimOpenSquareBracket { dataTagValidated = true diff --git a/go.work b/go.work index 58c40df495..c5077a96c0 100644 --- a/go.work +++ b/go.work @@ -19,7 +19,6 @@ go 1.21 use ( ./cmd/api/src ./packages/go/analysis - ./packages/go/bomenc ./packages/go/cache ./packages/go/conftool ./packages/go/crypto diff --git a/packages/go/bomenc/encodings.go b/packages/go/bomenc/encodings.go deleted file mode 100644 index 590d40c9bc..0000000000 --- a/packages/go/bomenc/encodings.go +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright 2024 Specter Ops, Inc. -// -// Licensed under the Apache License, Version 2.0 -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// SPDX-License-Identifier: Apache-2.0 - -package bomenc - -// Encoding interface defines the methods that all encoding types must implement. -// This interface provides a unified way to handle different encodings throughout the package, -// allowing us to treat all encodings uniformly. This design facilitates easy extension -// and manipulation of different encoding types without altering the core logic. -type Encoding interface { - // Sequence returns the byte sequence that represents the Byte Order Mark (BOM) for this encoding. - // This method is crucial for identifying the specific byte sequence that indicates - // this encoding at the start of a file. - Sequence() []byte - - // String returns a human-readable string representation of the encoding. - // This is particularly useful for logging and user interfaces, providing - // a user-friendly name for the encoding. - String() string - - // HasSequence checks if the given data starts with this encoding's BOM sequence. - // This method allows for efficient checking of whether a given byte slice - // begins with this encoding's BOM, which is essential for encoding detection. - HasSequence(data []byte) bool -} - -// bomEncoding is the concrete implementation of the Encoding interface. -// It encapsulates all necessary information and behavior for a specific encoding, -// providing a consistent structure for handling different encodings. This approach -// allows us to create instances for each supported encoding while maintaining -// a uniform interface for interaction. -type bomEncoding struct { - encodingType string // A human-readable name for the encoding - sequence []byte // The BOM sequence for this encoding - hasSequenceFunc func(data []byte) bool // Function to check if data starts with this encoding's BOM -} - -// String returns the human-readable name of the encoding. -// This method fulfills the Encoding interface and provides a simple way -// to get a string representation of the encoding. -func (s bomEncoding) String() string { - return s.encodingType -} - -// Sequence returns the BOM sequence for this encoding. -// This method fulfills the Encoding interface and provides access to the BOM sequence, -// which is essential for encoding detection and writing files with proper BOMs. -func (s bomEncoding) Sequence() []byte { - return s.sequence -} - -// HasSequence checks if the given data starts with this encoding's BOM sequence. -// This method fulfills the Encoding interface and provides a way to check for -// the presence of this encoding's BOM, which is crucial for encoding detection. -func (s bomEncoding) HasSequence(data []byte) bool { - return s.hasSequenceFunc(data) -} - -// The following functions are used to check for specific encoding BOMs. -// By defining these as separate functions, we can easily reuse them -// and potentially extend them if more complex checking is needed in the future. -// This approach also keeps the bomEncoding struct clean and simple. - -func isUTF32BE(buf []byte) bool { - return len(buf) >= 4 && buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF -} - -func isUTF32LE(buf []byte) bool { - return len(buf) >= 4 && buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00 -} - -func isUTF8(buf []byte) bool { - return len(buf) >= 3 && buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF -} - -func isUTF16BE(buf []byte) bool { - return len(buf) >= 2 && buf[0] == 0xFE && buf[1] == 0xFF -} - -func isUTF16LE(buf []byte) bool { - return len(buf) >= 2 && buf[0] == 0xFF && buf[1] == 0xFE -} - -// The following variables define the supported encodings. -// By defining these as package-level variables, we allow easy reference -// throughout the package and by users of the package. This design also -// facilitates potential future extension by simply adding new encoding variables. - -// Unknown represents an unknown or unrecognized encoding. -// Having an Unknown encoding allows us to handle cases where -// the encoding cannot be determined, providing a fallback option. -var Unknown Encoding = bomEncoding{ - encodingType: "Unknown", - sequence: nil, // Unknown encoding has no BOM sequence - hasSequenceFunc: func(data []byte) bool { return false }, -} - -// UTF8 represents the UTF-8 encoding. -var UTF8 Encoding = bomEncoding{ - encodingType: "UTF-8", - sequence: []byte{0xEF, 0xBB, 0xBF}, // UTF-8 BOM sequence - hasSequenceFunc: isUTF8, -} - -// UTF16BE represents the UTF-16 Big Endian encoding. -var UTF16BE Encoding = bomEncoding{ - encodingType: "UTF-16 BE", - sequence: []byte{0xFE, 0xFF}, // UTF-16 BE BOM sequence - hasSequenceFunc: isUTF16BE, -} - -// UTF16LE represents the UTF-16 Little Endian encoding. -var UTF16LE Encoding = bomEncoding{ - encodingType: "UTF-16 LE", - sequence: []byte{0xFF, 0xFE}, // UTF-16 LE BOM sequence - hasSequenceFunc: isUTF16LE, -} - -// UTF32BE represents the UTF-32 Big Endian encoding. -var UTF32BE Encoding = bomEncoding{ - encodingType: "UTF-32 BE", - sequence: []byte{0x00, 0x00, 0xFE, 0xFF}, // UTF-32 BE BOM sequence - hasSequenceFunc: isUTF32BE, -} - -// UTF32LE represents the UTF-32 Little Endian encoding. -var UTF32LE Encoding = bomEncoding{ - encodingType: "UTF-32 LE", - sequence: []byte{0xFF, 0xFE, 0x00, 0x00}, // UTF-32 LE BOM sequence - hasSequenceFunc: isUTF32LE, -} diff --git a/packages/go/bomenc/encodings_test.go b/packages/go/bomenc/encodings_test.go deleted file mode 100644 index c7ebf3f3ce..0000000000 --- a/packages/go/bomenc/encodings_test.go +++ /dev/null @@ -1,221 +0,0 @@ -// Copyright 2024 Specter Ops, Inc. -// -// Licensed under the Apache License, Version 2.0 -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// SPDX-License-Identifier: Apache-2.0 - -package bomenc - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestEncodingInterface(t *testing.T) { - encodings := []struct { - name string - encoding Encoding - }{ - {name: "Unknown", encoding: Unknown}, - {name: "UTF8", encoding: UTF8}, - {name: "UTF16BE", encoding: UTF16BE}, - {name: "UTF16LE", encoding: UTF16LE}, - {name: "UTF32BE", encoding: UTF32BE}, - {name: "UTF32LE", encoding: UTF32LE}, - } - - for _, tt := range encodings { - t.Run(tt.name, func(t *testing.T) { - assert.NotEmpty(t, tt.encoding.String(), "Encoding String() should not be empty") - if tt.encoding.String() != Unknown.String() { - assert.NotEmpty(t, tt.encoding.Sequence(), "Encoding Sequence() should not be empty for non-Unknown encodings") - } - // Test HasSequence method - if tt.encoding.String() != Unknown.String() { - assert.True(t, tt.encoding.HasSequence(tt.encoding.Sequence()), "HasSequence() should return true for its own sequence") - } - }) - } -} - -func TestEncodingValues(t *testing.T) { - tests := []struct { - name string - encoding Encoding - expectedType string - expectedSeq []byte - }{ - { - name: "Unknown", - encoding: Unknown, - expectedType: "Unknown", - expectedSeq: nil, - }, - { - name: "UTF-8", - encoding: UTF8, - expectedType: "UTF-8", - expectedSeq: []byte{0xEF, 0xBB, 0xBF}, - }, - { - name: "UTF-16 BE", - encoding: UTF16BE, - expectedType: "UTF-16 BE", - expectedSeq: []byte{0xFE, 0xFF}, - }, - { - name: "UTF-16 LE", - encoding: UTF16LE, - expectedType: "UTF-16 LE", - expectedSeq: []byte{0xFF, 0xFE}, - }, - { - name: "UTF-32 BE", - encoding: UTF32BE, - expectedType: "UTF-32 BE", - expectedSeq: []byte{0x00, 0x00, 0xFE, 0xFF}, - }, - { - name: "UTF-32 LE", - encoding: UTF32LE, - expectedType: "UTF-32 LE", - expectedSeq: []byte{0xFF, 0xFE, 0x00, 0x00}, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.expectedType, tt.encoding.String(), "Encoding type should match") - assert.Equal(t, tt.expectedSeq, tt.encoding.Sequence(), "Encoding sequence should match") - if tt.encoding.String() != Unknown.String() { - assert.True(t, tt.encoding.HasSequence(tt.expectedSeq), "HasSequence() should return true for the expected sequence") - } - }) - } -} - -func TestBOMEncoding(t *testing.T) { - testCases := []struct { - name string - encoding bomEncoding - expectedString string - expectedSeq []byte - testData []byte - hasSequence bool - }{ - { - name: "Custom encoding", - encoding: bomEncoding{ - encodingType: "Custom", - sequence: []byte{0x01, 0x02, 0x03}, - hasSequenceFunc: func(data []byte) bool { - return len(data) >= 3 && data[0] == 0x01 && data[1] == 0x02 && data[2] == 0x03 - }, - }, - expectedString: "Custom", - expectedSeq: []byte{0x01, 0x02, 0x03}, - testData: []byte{0x01, 0x02, 0x03, 0x04}, - hasSequence: true, - }, - { - name: "Empty encoding", - encoding: bomEncoding{ - encodingType: "", - sequence: []byte{}, - hasSequenceFunc: func(data []byte) bool { return len(data) == 0 }, - }, - expectedString: "", - expectedSeq: []byte{}, - testData: []byte{0x01, 0x02, 0x03}, - hasSequence: false, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - assert.Equal(t, tc.expectedString, tc.encoding.String(), "bomEncoding String() should return correct value") - assert.Equal(t, tc.expectedSeq, tc.encoding.Sequence(), "bomEncoding Sequence() should return correct value") - assert.Equal(t, tc.hasSequence, tc.encoding.HasSequence(tc.testData), "bomEncoding HasSequence() should return correct value") - }) - } -} - -func TestEncodingEquality(t *testing.T) { - testCases := []struct { - name string - enc1 Encoding - enc2 Encoding - expected bool - }{ - { - name: "Same encoding", - enc1: UTF8, - enc2: UTF8, - expected: true, - }, - { - name: "Different encodings", - enc1: UTF8, - enc2: UTF16BE, - expected: false, - }, - { - name: "Unknown and other encoding", - enc1: Unknown, - enc2: UTF8, - expected: false, - }, - { - name: "Both Unknown", - enc1: Unknown, - enc2: Unknown, - expected: true, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - assert.Equal(t, tc.expected, tc.enc1.String() == tc.enc2.String(), "Encoding equality check should be correct") - }) - } -} - -func TestHasSequence(t *testing.T) { - testCases := []struct { - name string - encoding Encoding - input []byte - expected bool - }{ - {"UTF-8 with correct BOM", UTF8, []byte{0xEF, 0xBB, 0xBF, 0x68, 0x65, 0x6C, 0x6C, 0x6F}, true}, - {"UTF-8 without BOM", UTF8, []byte{0x68, 0x65, 0x6C, 0x6C, 0x6F}, false}, - {"UTF-16BE with correct BOM", UTF16BE, []byte{0xFE, 0xFF, 0x00, 0x68, 0x00, 0x65}, true}, - {"UTF-16BE without BOM", UTF16BE, []byte{0x00, 0x68, 0x00, 0x65}, false}, - {"UTF-16LE with correct BOM", UTF16LE, []byte{0xFF, 0xFE, 0x68, 0x00, 0x65, 0x00}, true}, - {"UTF-16LE without BOM", UTF16LE, []byte{0x68, 0x00, 0x65, 0x00}, false}, - {"UTF-32BE with correct BOM", UTF32BE, []byte{0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x68}, true}, - {"UTF-32BE without BOM", UTF32BE, []byte{0x00, 0x00, 0x00, 0x68}, false}, - {"UTF-32LE with correct BOM", UTF32LE, []byte{0xFF, 0xFE, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00}, true}, - {"UTF-32LE without BOM", UTF32LE, []byte{0x68, 0x00, 0x00, 0x00}, false}, - {"Unknown encoding", Unknown, []byte{0x68, 0x65, 0x6C, 0x6C, 0x6F}, false}, - {"Empty input", UTF8, []byte{}, false}, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - result := tc.encoding.HasSequence(tc.input) - assert.Equal(t, tc.expected, result, "HasSequence() should correctly identify BOM presence") - }) - } -} diff --git a/packages/go/bomenc/go.mod b/packages/go/bomenc/go.mod deleted file mode 100644 index de1b999620..0000000000 --- a/packages/go/bomenc/go.mod +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2024 Specter Ops, Inc. -// -// Licensed under the Apache License, Version 2.0 -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// SPDX-License-Identifier: Apache-2.0 - -module github.com/specterops/bloodhound/bomenc - -go 1.21 - -require github.com/stretchr/testify v1.8.4 - -require ( - github.com/davecgh/go-spew v1.1.1 // indirect - github.com/kr/pretty v0.3.1 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/rogpeppe/go-internal v1.10.0 // indirect - gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect -) diff --git a/packages/go/bomenc/go.sum b/packages/go/bomenc/go.sum deleted file mode 100644 index d219feaa3c..0000000000 --- a/packages/go/bomenc/go.sum +++ /dev/null @@ -1,8 +0,0 @@ -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= -github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= -gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= -gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/packages/go/bomenc/normalize.go b/packages/go/bomenc/normalize.go deleted file mode 100644 index b816b84a3b..0000000000 --- a/packages/go/bomenc/normalize.go +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright 2024 Specter Ops, Inc. -// -// Licensed under the Apache License, Version 2.0 -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// SPDX-License-Identifier: Apache-2.0 - -package bomenc - -import ( - "bytes" - "errors" - "io" - "unicode/utf8" -) - -// Normalizer interface defines the methods for accessing normalized content. -// This allows consumers of the package to work with normalized data without -// knowing the specifics of how the normalization was performed. -type Normalizer interface { - // NormalizedContent returns the normalized (UTF-8) content. - NormalizedContent() []byte - // NormalizedFrom returns the original encoding of the content before normalization. - NormalizedFrom() Encoding -} - -// normalizer is the concrete implementation of the Normalizer interface. -type normalizer struct { - normalizedContent []byte // The normalized (UTF-8) content - normalizedFrom Encoding // The original encoding before normalization -} - -// NormalizedContent returns the normalized (UTF-8) content. -func (s normalizer) NormalizedContent() []byte { - return s.normalizedContent -} - -// NormalizedFrom returns the original encoding of the content before normalization. -func (s normalizer) NormalizedFrom() Encoding { - return s.normalizedFrom -} - -// DetectBOMEncoding detects the byte order mark in the given bytes and returns the corresponding Encoding. -// This function is crucial for determining the encoding of incoming data based on its BOM. -func DetectBOMEncoding(data []byte) (Encoding, error) { - // Check for UTF-8 BOM - if len(data) >= 3 && bytes.Equal(data[:3], UTF8.Sequence()) { - return UTF8, nil - } - - if len(data) >= 2 { - // Check for UTF-16BE BOM - if bytes.Equal(data[:2], UTF16BE.Sequence()) { - return UTF16BE, nil - } - // Check for potential UTF-16LE or UTF-32LE BOM - if bytes.Equal(data[:2], UTF16LE.Sequence()) { - // We need to differentiate between UTF-16LE and UTF-32LE - if len(data) >= 8 { - isLikelyUTF16LE := data[2] != 0 || data[3] != 0 || data[4] != 0 || data[5] != 0 - isLikelyUTF32LE := data[2] == 0 && data[3] == 0 && data[6] == 0 && data[7] == 0 - switch { - case isLikelyUTF32LE && isLikelyUTF16LE: - return UTF32LE, nil - case isLikelyUTF16LE && !isLikelyUTF32LE: - return UTF16LE, nil - case isLikelyUTF32LE && !isLikelyUTF16LE: - return UTF32BE, nil - default: - return UTF16LE, nil - } - } - // If we can't determine definitively, default to Unknown - return Unknown, nil - } - } - - // Check for UTF-32BE BOM - if len(data) >= 4 && bytes.Equal(data[:4], UTF32BE.Sequence()) { - return UTF32BE, nil - } - - // If no BOM is detected, return Unknown - return Unknown, nil -} - -// NormalizeToUTF8 converts the input to UTF-8, removing any BOM. -// This function is the main entry point for normalizing data from an io.Reader. -// It's useful when working with streams of data, such as file input. -func NormalizeToUTF8(input io.Reader) (Normalizer, error) { - data, err := io.ReadAll(input) - if err != nil { - return nil, err - } - - detectedBOMEncoding, err := DetectBOMEncoding(data) - if err != nil { - return nil, err - } - - return NormalizeBytesToUTF8(data, detectedBOMEncoding) -} - -// ErrUnknownEncodingInvalidUTF8 ... -var ErrUnknownEncodingInvalidUTF8 = errors.New("unknown encoding and not a valid UTF-8") - -// NormalizeBytesToUTF8 converts the given bytes to UTF-8 based on the specified encoding. -// This function is the core of the normalization process, handling different encodings -// and converting them to UTF-8. -func NormalizeBytesToUTF8(data []byte, enc Encoding) (Normalizer, error) { - var ( - content []byte - err error - ) - - switch enc.String() { - case UTF8.String(): - content = data[min(len(enc.Sequence()), len(data)):] - case UTF16BE.String(): - content, err = utf16ToUTF8(data[min(len(enc.Sequence()), len(data)):], true) - case UTF16LE.String(): - content, err = utf16ToUTF8(data[min(len(enc.Sequence()), len(data)):], false) - case UTF32BE.String(): - content, err = utf32ToUTF8(data[min(len(enc.Sequence()), len(data)):], true) - case UTF32LE.String(): - content, err = utf32ToUTF8(data[min(len(enc.Sequence()), len(data)):], false) - case Unknown.String(): - if utf8.Valid(data) { - content = data - } else { - return nil, ErrUnknownEncodingInvalidUTF8 - } - } - - if err != nil { - return nil, err - } - - return normalizer{ - normalizedContent: content, - normalizedFrom: enc, - }, nil -} diff --git a/packages/go/bomenc/normalize_test.go b/packages/go/bomenc/normalize_test.go deleted file mode 100644 index 9e38123edd..0000000000 --- a/packages/go/bomenc/normalize_test.go +++ /dev/null @@ -1,347 +0,0 @@ -// Copyright 2024 Specter Ops, Inc. -// -// Licensed under the Apache License, Version 2.0 -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// SPDX-License-Identifier: Apache-2.0 - -package bomenc - -import ( - "bytes" - "errors" - "testing" - "unicode/utf16" - "unicode/utf8" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestDetectBOMEncoding(t *testing.T) { - tests := []struct { - name string - input []byte - expected Encoding - }{ - { - name: "UTF-8 BOM", - input: []byte{0xEF, 0xBB, 0xBF, 0x68, 0x65, 0x6C, 0x6C, 0x6F}, - expected: UTF8, - }, - { - name: "UTF-16BE BOM", - input: []byte{0xFE, 0xFF, 0x00, 0x68, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F}, - expected: UTF16BE, - }, - { - name: "UTF-16LE BOM", - input: []byte{0xFF, 0xFE, 0x68, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F, 0x00}, - expected: UTF16LE, - }, - { - name: "UTF-32BE BOM", - input: []byte{0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x65}, - expected: UTF32BE, - }, - { - name: "UTF-32LE BOM", - input: []byte{0xFF, 0xFE, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00}, - expected: UTF32LE, - }, - { - name: "No BOM", - input: []byte{0x68, 0x65, 0x6C, 0x6C, 0x6F}, - expected: Unknown, - }, - { - name: "Empty input", - input: []byte{}, - expected: Unknown, - }, - { - name: "Incomplete UTF-16LE BOM (should not be detected as UTF-16LE)", - input: []byte{0xFF, 0xFE, 0x68}, - expected: Unknown, - }, - { - name: "Incomplete UTF-32LE BOM (should not be detected as UTF-32LE)", - input: []byte{0xFF, 0xFE, 0x00}, - expected: Unknown, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result, err := DetectBOMEncoding(tt.input) - require.NoError(t, err) - assert.Equal(t, tt.expected.String(), result.String(), "DetectBOMEncoding() should return the correct encoding") - }) - } -} - -func TestNormalizeToUTF8(t *testing.T) { - tests := []struct { - name string - input []byte - expected []byte - encFrom Encoding - wantErr bool - }{ - { - name: "UTF-8 BOM", - input: []byte{0xEF, 0xBB, 0xBF, 0x68, 0x65, 0x6C, 0x6C, 0x6F}, - expected: []byte("hello"), - encFrom: UTF8, - wantErr: false, - }, - { - name: "UTF-16BE BOM", - input: []byte{0xFE, 0xFF, 0x00, 0x68, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F}, - expected: []byte("hello"), - encFrom: UTF16BE, - wantErr: false, - }, - { - name: "UTF-16LE BOM", - input: []byte{0xFF, 0xFE, 0x68, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F, 0x00}, - expected: []byte("hello"), - encFrom: UTF16LE, - wantErr: false, - }, - { - name: "UTF-32BE BOM", - input: []byte{0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6F}, - expected: []byte("hello"), - encFrom: UTF32BE, - wantErr: false, - }, - { - name: "UTF-32LE BOM", - input: []byte{0xFF, 0xFE, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00}, - expected: []byte("hello"), - encFrom: UTF32LE, - wantErr: false, - }, - { - name: "No BOM (valid UTF-8)", - input: []byte("hello"), - expected: []byte("hello"), - encFrom: Unknown, - wantErr: false, - }, - { - name: "No BOM (invalid UTF-8)", - input: []byte{0xFF, 0xFE, 0xFD}, - expected: nil, - encFrom: Unknown, - wantErr: true, - }, - { - name: "Empty input", - input: []byte{}, - expected: []byte{}, - encFrom: Unknown, - wantErr: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - reader := bytes.NewReader(tt.input) - result, err := NormalizeToUTF8(reader) - - if tt.wantErr { - assert.Error(t, err, "NormalizeToUTF8() should return an error for invalid input") - return - } - - require.NoError(t, err, "NormalizeToUTF8() should not return an error for valid input") - assert.Equal(t, tt.expected, result.NormalizedContent(), "NormalizedContent() should return the correct normalized content") - assert.Equal(t, tt.encFrom.String(), result.NormalizedFrom().String(), "NormalizedFrom() should return the correct original encoding") - }) - } -} - -func TestNormalizeBytesToUTF8(t *testing.T) { - tests := []struct { - name string - input []byte - enc Encoding - expected []byte - wantErr bool - }{ - { - name: "UTF-8 BOM", - input: []byte{0xEF, 0xBB, 0xBF, 0x68, 0x65, 0x6C, 0x6C, 0x6F}, - enc: UTF8, - expected: []byte("hello"), - wantErr: false, - }, - { - name: "UTF-16BE BOM", - input: []byte{0xFE, 0xFF, 0x00, 0x68, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F}, - enc: UTF16BE, - expected: []byte("hello"), - wantErr: false, - }, - { - name: "UTF-16LE BOM", - input: []byte{0xFF, 0xFE, 0x68, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F, 0x00}, - enc: UTF16LE, - expected: []byte("hello"), - wantErr: false, - }, - { - name: "UTF-32BE BOM", - input: []byte{0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6F}, - enc: UTF32BE, - expected: []byte("hello"), - wantErr: false, - }, - { - name: "UTF-32LE BOM", - input: []byte{0xFF, 0xFE, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00}, - enc: UTF32LE, - expected: []byte("hello"), - wantErr: false, - }, - { - name: "No BOM (valid UTF-8)", - input: []byte("hello"), - enc: Unknown, - expected: []byte("hello"), - wantErr: false, - }, - { - name: "No BOM (invalid UTF-8)", - input: []byte{0xFF, 0xFE, 0xFD}, - enc: Unknown, - expected: nil, - wantErr: true, - }, - { - name: "Empty input", - input: []byte{}, - enc: Unknown, - expected: []byte{}, - wantErr: false, - }, - { - name: "Invalid UTF-16", - input: []byte{0xFE, 0xFF, 0x00}, - enc: UTF16BE, - expected: nil, - wantErr: true, - }, - { - name: "Invalid UTF-32", - input: []byte{0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00}, - enc: UTF32BE, - expected: nil, - wantErr: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result, err := NormalizeBytesToUTF8(tt.input, tt.enc) - - if tt.wantErr { - assert.Error(t, err, "NormalizeBytesToUTF8() should return an error for invalid input") - return - } - - require.NoError(t, err, "NormalizeBytesToUTF8() should not return an error for valid input") - assert.Equal(t, tt.expected, result.NormalizedContent(), "NormalizedContent() should return the correct normalized content") - assert.Equal(t, tt.enc.String(), result.NormalizedFrom().String(), "NormalizedFrom() should return the correct original encoding") - }) - } -} - -// Mock reader for testing error cases -type errorReader struct{} - -func (er errorReader) Read(p []byte) (n int, err error) { - return 0, errors.New("mock read error") -} - -func TestNormalizeToUTF8_ReaderError(t *testing.T) { - _, err := NormalizeToUTF8(errorReader{}) - assert.Error(t, err, "NormalizeToUTF8() should return an error when the reader fails") -} - -func TestNormalizeToUTF8_LargeInput(t *testing.T) { - type testCase struct { - name string - input []byte - expected []byte - encFrom Encoding - } - - // Generate a large input with 1000 Unicode code points - var utf16LE, expected []byte - - for i := 0; i < 1000; i++ { - r := rune(i % 0x10FFFF) // Use all possible Unicode code points - - // UTF-16 - u16 := utf16.Encode([]rune{r}) - for _, c := range u16 { - utf16LE = append(utf16LE, byte(c), byte(c>>8)) - } - - // Expected UTF-8 - buf := make([]byte, 4) - n := utf8.EncodeRune(buf, r) - expected = append(expected, buf[:n]...) - } - - // Add BOM - utf16LE = append([]byte{0xFF, 0xFE}, utf16LE...) - - tests := []testCase{ - { - name: "Large UTF-16LE input", - input: utf16LE, - expected: expected, - encFrom: UTF16LE, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - reader := bytes.NewReader(tt.input) - result, err := NormalizeToUTF8(reader) - if err != nil { - t.Errorf("NormalizeToUTF8() error = %v", err) - // Print the first few bytes of the input for debugging - t.Logf("First 20 bytes of input: %v", tt.input[:20]) - // Print the detected encoding - detectedEnc, err := DetectBOMEncoding(tt.input) - assert.NoError(t, err) - t.Logf("Detected encoding: %v", detectedEnc) - return - } - - assert.Equal(t, tt.encFrom.String(), result.NormalizedFrom().String(), "NormalizedFrom() should return the correct original encoding") - - if !bytes.Equal(tt.expected, result.NormalizedContent()) { - t.Errorf("NormalizedContent() = %v, want %v", result.NormalizedContent(), tt.expected) - // Print the first few bytes of the result and expected for debugging - t.Logf("First 20 bytes of result: %v", result.NormalizedContent()[:20]) - t.Logf("First 20 bytes of expected: %v", tt.expected[:20]) - - } - }) - } -} diff --git a/packages/go/bomenc/utf16.go b/packages/go/bomenc/utf16.go deleted file mode 100644 index 6bf9f1f68f..0000000000 --- a/packages/go/bomenc/utf16.go +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright 2024 Specter Ops, Inc. -// -// Licensed under the Apache License, Version 2.0 -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// SPDX-License-Identifier: Apache-2.0 - -package bomenc - -import ( - "bytes" - "errors" -) - -// utf16ToUTF8 converts a UTF-16 encoded byte slice to UTF-8. -// It handles both big-endian and little-endian encodings, as well as surrogate pairs. -// -// Parameters: -// - data: A byte slice containing UTF-16 encoded data -// - bigEndian: A boolean indicating whether the input is big-endian (true) or little-endian (false) -// -// Returns: -// - A byte slice containing the UTF-8 encoded result -// - An error if the conversion fails (e.g., due to incomplete input) -// -// Advantages of using bitwise operations for this conversion: -// -// 1. Efficiency: Bitwise operations are extremely fast at the CPU level. -// They typically execute in a single clock cycle, making them more -// efficient than arithmetic operations or function calls. -// -// 2. Direct memory manipulation: Bitwise operations allow us to work -// directly with the binary representation of the data, which is -// crucial when dealing with different byte-order encodings. -// -// 3. Preserving data integrity: By using bitwise operations, we ensure -// that we're interpreting the bytes exactly as they are, without any -// unintended modifications that could occur with higher-level operations. -// -// 4. Endianness handling: Bitwise operations make it easy to handle both -// big-endian and little-endian encodings with minimal code duplication. -// -// 5. Performance in loops: When processing large amounts of text, the -// performance benefits of bitwise operations become significant due -// to the number of times these operations are repeated. -// -// 6. Low-level control: Bitwise operations provide fine-grained control -// over individual bits, which is necessary for correct interpretation -// of multi-byte character encodings. -func utf16ToUTF8(data []byte, bigEndian bool) ([]byte, error) { - var buf bytes.Buffer - var r rune - - for i := 0; i < len(data); i += 2 { - if i+1 >= len(data) { - return nil, errors.New("incomplete UTF-16 sequence") - } - - var codeUnit uint16 - if bigEndian { - // Big-endian: first byte is more significant - // Shift the first byte left by 8 bits and OR it with the second byte - // Example: - // data[i] = 0x12 (00010010 in binary) - // data[i+1] = 0x34 (00110100 in binary) - // result = 0x1234 (0001001000110100 in binary) - codeUnit = uint16(data[i])<<8 | uint16(data[i+1]) - } else { - // Little-endian: second byte is more significant - // Shift the second byte left by 8 bits and OR it with the first byte - // Example: - // data[i] = 0x34 (00110100 in binary) - // data[i+1] = 0x12 (00010010 in binary) - // result = 0x1234 (0001001000110100 in binary) - codeUnit = uint16(data[i+1])<<8 | uint16(data[i]) - } - - if codeUnit >= 0xD800 && codeUnit <= 0xDBFF { - if i+3 >= len(data) { - buf.WriteRune(0xFFFD) - break - } - - var lowSurrogate uint16 - if bigEndian { - lowSurrogate = uint16(data[i+2])<<8 | uint16(data[i+3]) - } else { - lowSurrogate = uint16(data[i+3])<<8 | uint16(data[i+2]) - } - - if lowSurrogate >= 0xDC00 && lowSurrogate <= 0xDFFF { - // Combine high and low surrogates into a single code point - // 1. Subtract 0xD800 from the high surrogate to get the high 10 bits - // 2. Subtract 0xDC00 from the low surrogate to get the low 10 bits - // 3. Shift the high 10 bits left by 10 positions - // 4. OR the result with the low 10 bits - // 5. Add 0x10000 to get the final code point - // Example: - // codeUnit = 0xD801 (1101100000000001 in binary) - // lowSurrogate = 0xDC37 (1101110000110111 in binary) - // Step 1: 0xD801 - 0xD800 = 0x0001 - // Step 2: 0xDC37 - 0xDC00 = 0x0037 - // Step 3: 0x0001 << 10 = 0x0400 - // Step 4: 0x0400 | 0x0037 = 0x0437 - // Step 5: 0x0437 + 0x10000 = 0x10437 (Code point U+10437) - r = (rune(codeUnit-0xD800)<<10 | rune(lowSurrogate-0xDC00)) + 0x10000 - i += 2 - } else { - r = 0xFFFD - } - } else if codeUnit >= 0xDC00 && codeUnit <= 0xDFFF { - r = 0xFFFD - } else { - r = rune(codeUnit) - } - - buf.WriteRune(r) - } - - return buf.Bytes(), nil -} diff --git a/packages/go/bomenc/utf16_test.go b/packages/go/bomenc/utf16_test.go deleted file mode 100644 index 869ef7ca32..0000000000 --- a/packages/go/bomenc/utf16_test.go +++ /dev/null @@ -1,216 +0,0 @@ -// Copyright 2024 Specter Ops, Inc. -// -// Licensed under the Apache License, Version 2.0 -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// SPDX-License-Identifier: Apache-2.0 - -package bomenc - -import ( - "testing" - "unicode/utf16" - "unicode/utf8" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestUTF16ToUTF8(t *testing.T) { - tests := []struct { - name string - input []byte - bigEndian bool - expected []byte - wantErr bool - }{ - { - name: "UTF-16BE Basic ASCII", - input: []byte{0x00, 0x68, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F}, - bigEndian: true, - expected: []byte("hello"), - wantErr: false, - }, - { - name: "UTF-16LE Basic ASCII", - input: []byte{0x68, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F, 0x00}, - bigEndian: false, - expected: []byte("hello"), - wantErr: false, - }, - { - name: "UTF-16BE with BMP characters", - input: []byte{0x00, 0x41, 0x26, 0x3A, 0x00, 0x42}, - bigEndian: true, - expected: []byte("A☺B"), - wantErr: false, - }, - { - name: "UTF-16LE with BMP characters", - input: []byte{0x41, 0x00, 0x3A, 0x26, 0x42, 0x00}, - bigEndian: false, - expected: []byte("A☺B"), - wantErr: false, - }, - { - name: "UTF-16BE with surrogate pair", - input: []byte{0xD8, 0x3D, 0xDE, 0x00}, - bigEndian: true, - expected: []byte{0xF0, 0x9F, 0x98, 0x80}, // U+1F600 - 😀 GRINNING FACE emoji - wantErr: false, - }, - { - name: "UTF-16LE with surrogate pair (GRINNING FACE emoji)", - input: []byte{0x3D, 0xD8, 0x00, 0xDE}, - bigEndian: false, - expected: []byte{0xF0, 0x9F, 0x98, 0x80}, // U+1F600 - 😀 GRINNING FACE emoji - wantErr: false, - }, - { - name: "UTF-16BE with mixed characters", - input: []byte{0x00, 0x48, 0x00, 0x69, 0xD8, 0x3D, 0xDE, 0x00, 0x00, 0x21}, - bigEndian: true, - expected: []byte{0x48, 0x69, 0xF0, 0x9F, 0x98, 0x80, 0x21}, // "Hi😀!" (GRINNING FACE emoji) - wantErr: false, - }, - { - name: "UTF-16LE with mixed characters (GRINNING FACE emoji)", - input: []byte{0x48, 0x00, 0x69, 0x00, 0x3D, 0xD8, 0x00, 0xDE, 0x21, 0x00}, - bigEndian: false, - expected: []byte{0x48, 0x69, 0xF0, 0x9F, 0x98, 0x80, 0x21}, // "Hi😀!" (GRINNING FACE emoji) - wantErr: false, - }, - { - name: "Incomplete UTF-16BE sequence", - input: []byte{0x00, 0x68, 0x00}, - bigEndian: true, - expected: nil, - wantErr: true, - }, - { - name: "Incomplete UTF-16LE sequence", - input: []byte{0x68, 0x00, 0x65}, - bigEndian: false, - expected: nil, - wantErr: true, - }, - { - name: "Empty input", - input: []byte{}, - bigEndian: true, - expected: nil, - wantErr: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result, err := utf16ToUTF8(tt.input, tt.bigEndian) - - if tt.wantErr { - assert.Error(t, err, "utf16ToUTF8() should return an error for invalid input") - return - } - - require.NoError(t, err, "utf16ToUTF8() should not return an error for valid input") - assert.Equal(t, tt.expected, result, "utf16ToUTF8() should return the correct UTF-8 bytes") - }) - } -} - -func TestUTF16ToUTF8_LargeInput(t *testing.T) { - // Generate a large input with 1000 Unicode code points - var largeInputBE, largeInputLE []byte - var expected []byte - - for i := 0; i < 1000; i++ { - r := rune(i % 0x10FFFF) // Use all possible Unicode code points - - utf16Sequence := utf16.Encode([]rune{r}) - for _, u16 := range utf16Sequence { - // Big Endian - largeInputBE = append(largeInputBE, byte(u16>>8), byte(u16)) - // Little Endian - largeInputLE = append(largeInputLE, byte(u16), byte(u16>>8)) - } - - // Append UTF-8 encoded rune to expected result - buf := make([]byte, 4) - n := utf8.EncodeRune(buf, r) - expected = append(expected, buf[:n]...) - } - - t.Run("Large UTF-16BE input", func(t *testing.T) { - result, err := utf16ToUTF8(largeInputBE, true) - require.NoError(t, err, "utf16ToUTF8() should not return an error for valid large input") - assert.Equal(t, expected, result, "utf16ToUTF8() should correctly convert large UTF-16BE input") - }) - - t.Run("Large UTF-16LE input", func(t *testing.T) { - result, err := utf16ToUTF8(largeInputLE, false) - require.NoError(t, err, "utf16ToUTF8() should not return an error for valid large input") - assert.Equal(t, expected, result, "utf16ToUTF8() should correctly convert large UTF-16LE input") - }) -} - -func TestUTF16ToUTF8_SurrogateEdgeCases(t *testing.T) { - tests := []struct { - name string - input []byte - bigEndian bool - expected []byte - wantErr bool - }{ - { - name: "UTF-16BE High surrogate without low surrogate", - input: []byte{0xD8, 0x00, 0x00, 0x41}, - bigEndian: true, - expected: []byte{0xEF, 0xBF, 0xBD, 0x41}, // Replacement character followed by 'A' - wantErr: false, - }, - { - name: "UTF-16LE High surrogate without low surrogate", - input: []byte{0x00, 0xD8, 0x41, 0x00}, - bigEndian: false, - expected: []byte{0xEF, 0xBF, 0xBD, 0x41}, // Replacement character followed by 'A' - wantErr: false, - }, - { - name: "UTF-16BE Low surrogate without high surrogate", - input: []byte{0xDC, 0x00, 0x00, 0x41}, - bigEndian: true, - expected: []byte{0xEF, 0xBF, 0xBD, 0x41}, // Replacement character followed by 'A' - wantErr: false, - }, - { - name: "UTF-16LE Low surrogate without high surrogate", - input: []byte{0x00, 0xDC, 0x41, 0x00}, - bigEndian: false, - expected: []byte{0xEF, 0xBF, 0xBD, 0x41}, // Replacement character followed by 'A' - wantErr: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result, err := utf16ToUTF8(tt.input, tt.bigEndian) - - if tt.wantErr { - assert.Error(t, err, "utf16ToUTF8() should return an error for invalid input") - return - } - - require.NoError(t, err, "utf16ToUTF8() should not return an error for valid input") - assert.Equal(t, tt.expected, result, "utf16ToUTF8() should return the correct UTF-8 bytes") - }) - } -} diff --git a/packages/go/bomenc/utf32.go b/packages/go/bomenc/utf32.go deleted file mode 100644 index 8bfdfeaef5..0000000000 --- a/packages/go/bomenc/utf32.go +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright 2024 Specter Ops, Inc. -// -// Licensed under the Apache License, Version 2.0 -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// SPDX-License-Identifier: Apache-2.0 - -package bomenc - -import ( - "bytes" - "errors" -) - -// utf32ToUTF8 converts UTF-32 encoded bytes to UTF-8. -// -// Advantages of using bitwise operations for this conversion: -// -// 1. Efficiency: Bitwise operations are highly optimized at the hardware level, -// often executing in a single CPU cycle. This makes them faster than -// arithmetic operations or function calls, especially important when -// processing large volumes of text. -// -// 2. Direct byte manipulation: Bitwise operations allow us to work directly -// with the binary representation of the data. This is crucial for correctly -// interpreting UTF-32 encoded characters, which span four bytes. -// -// 3. Precision: When dealing with character encodings, every bit matters. -// Bitwise operations ensure we're interpreting the data exactly as intended, -// without any unintended modifications that could occur with higher-level operations. -// -// 4. Endianness handling: UTF-32 can be either big-endian or little-endian. -// Bitwise operations provide a clean and efficient way to handle both -// encodings with minimal code duplication. -// -// 5. Memory efficiency: By manipulating bits directly, we avoid the need -// for intermediate data structures or type conversions, which can be -// beneficial for memory usage, especially when processing large files. -// -// 6. Portability: Bitwise operations behave consistently across different -// hardware architectures, ensuring our code works reliably on various systems. -// -// 7. Educational value: Understanding and using bitwise operations provides -// insights into low-level data representation, which is valuable knowledge -// for any programmer working with different character encodings or -// binary protocols. -func utf32ToUTF8(data []byte, bigEndian bool) ([]byte, error) { - var buf bytes.Buffer - - for i := 0; i < len(data); i += 4 { - if i+3 >= len(data) { - return nil, errors.New("incomplete UTF-32 sequence") - } - - var r rune - if bigEndian { - // Big Endian UTF-32 to rune conversion - // In big endian, the most significant byte comes first - - // Step 1: Convert the first byte to a uint32 and shift it left by 24 bits - // This operation moves the bits of the first byte to the highest-order position - // Example: if data[i] is 0x12, after shift it becomes 0x12000000 - firstByte := uint32(data[i]) << 24 - - // Step 2: Convert the second byte to a uint32 and shift it left by 16 bits - // This operation moves the bits of the second byte to the second highest-order position - // Example: if data[i+1] is 0x34, after shift it becomes 0x00340000 - secondByte := uint32(data[i+1]) << 16 - - // Step 3: Convert the third byte to a uint32 and shift it left by 8 bits - // This operation moves the bits of the third byte to the second lowest-order position - // Example: if data[i+2] is 0x56, after shift it becomes 0x00005600 - thirdByte := uint32(data[i+2]) << 8 - - // Step 4: Convert the fourth byte to a uint32 - // The fourth byte remains in the lowest-order position - // Example: if data[i+3] is 0x78, it remains 0x00000078 - fourthByte := uint32(data[i+3]) - - // Step 5: Combine all four bytes using bitwise OR - // This operation merges the four bytes into a single 32-bit value - // Example: 0x12000000 | 0x00340000 | 0x00005600 | 0x00000078 = 0x12345678 - r = rune(firstByte | secondByte | thirdByte | fourthByte) - - // The resulting rune r now contains the 32-bit UTF-32 code point - } else { - // Little Endian UTF-32 to rune conversion - // In little endian, the least significant byte comes first - - // Step 1: Convert the fourth byte to a uint32 and shift it left by 24 bits - // This operation moves the bits of the fourth byte to the highest-order position - // Example: if data[i+3] is 0x12, after shift it becomes 0x12000000 - fourthByte := uint32(data[i+3]) << 24 - - // Step 2: Convert the third byte to a uint32 and shift it left by 16 bits - // This operation moves the bits of the third byte to the second highest-order position - // Example: if data[i+2] is 0x34, after shift it becomes 0x00340000 - thirdByte := uint32(data[i+2]) << 16 - - // Step 3: Convert the second byte to a uint32 and shift it left by 8 bits - // This operation moves the bits of the second byte to the second lowest-order position - // Example: if data[i+1] is 0x56, after shift it becomes 0x00005600 - secondByte := uint32(data[i+1]) << 8 - - // Step 4: Convert the first byte to a uint32 - // The first byte remains in the lowest-order position - // Example: if data[i] is 0x78, it remains 0x00000078 - firstByte := uint32(data[i]) - - // Step 5: Combine all four bytes using bitwise OR - // This operation merges the four bytes into a single 32-bit value - // Example: 0x12000000 | 0x00340000 | 0x00005600 | 0x00000078 = 0x12345678 - r = rune(fourthByte | thirdByte | secondByte | firstByte) - - // The resulting rune r now contains the 32-bit UTF-32 code point - } - - // Write the rune to the buffer - // The WriteRune method automatically handles the conversion from the rune to UTF-8 - buf.WriteRune(r) - } - - return buf.Bytes(), nil -} diff --git a/packages/go/bomenc/utf32_test.go b/packages/go/bomenc/utf32_test.go deleted file mode 100644 index 5ae843c6e2..0000000000 --- a/packages/go/bomenc/utf32_test.go +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright 2024 Specter Ops, Inc. -// -// Licensed under the Apache License, Version 2.0 -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// SPDX-License-Identifier: Apache-2.0 - -package bomenc - -import ( - "testing" - "unicode/utf8" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestUTF32ToUTF8(t *testing.T) { - tests := []struct { - name string - input []byte - bigEndian bool - expected []byte - wantErr bool - }{ - { - name: "UTF-32BE Basic ASCII", - input: []byte{0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6F}, - bigEndian: true, - expected: []byte("hello"), - wantErr: false, - }, - { - name: "UTF-32LE Basic ASCII", - input: []byte{0x68, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00}, - bigEndian: false, - expected: []byte("hello"), - wantErr: false, - }, - { - name: "UTF-32BE with BMP characters", - input: []byte{0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x26, 0x3A, 0x00, 0x00, 0x00, 0x42}, - bigEndian: true, - expected: []byte("A☺B"), - wantErr: false, - }, - { - name: "UTF-32LE with BMP characters", - input: []byte{0x41, 0x00, 0x00, 0x00, 0x3A, 0x26, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00}, - bigEndian: false, - expected: []byte("A☺B"), - wantErr: false, - }, - { - name: "UTF-32BE with non-BMP character", - input: []byte{0x00, 0x01, 0xF4, 0x00}, - bigEndian: true, - expected: []byte("🐀"), - wantErr: false, - }, - { - name: "UTF-32LE with non-BMP character", - input: []byte{0x00, 0xF4, 0x01, 0x00}, - bigEndian: false, - expected: []byte("🐀"), - wantErr: false, - }, - { - name: "Incomplete UTF-32BE sequence", - input: []byte{0x00, 0x00, 0x00}, - bigEndian: true, - expected: nil, - wantErr: true, - }, - { - name: "Incomplete UTF-32LE sequence", - input: []byte{0x00, 0x00, 0x00}, - bigEndian: false, - expected: nil, - wantErr: true, - }, - { - name: "Empty input", - input: []byte{}, - bigEndian: true, - expected: nil, - wantErr: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result, err := utf32ToUTF8(tt.input, tt.bigEndian) - - if tt.wantErr { - assert.Error(t, err, "utf32ToUTF8() should return an error for invalid input") - return - } - - require.NoError(t, err, "utf32ToUTF8() should not return an error for valid input") - assert.Equal(t, tt.expected, result, "utf32ToUTF8() should return the correct UTF-8 bytes") - }) - } -} - -func TestUTF32ToUTF8_LargeInput(t *testing.T) { - // Generate a large input with 1000 characters - largeInputBE := make([]byte, 4000) - largeInputLE := make([]byte, 4000) - var expected []byte - - for i := 0; i < 1000; i++ { - codePoint := rune(i % 0x10FFFF) // Use all possible Unicode code points - - // Big Endian - largeInputBE[i*4] = byte(codePoint >> 24) - largeInputBE[i*4+1] = byte(codePoint >> 16) - largeInputBE[i*4+2] = byte(codePoint >> 8) - largeInputBE[i*4+3] = byte(codePoint) - - // Little Endian - largeInputLE[i*4] = byte(codePoint) - largeInputLE[i*4+1] = byte(codePoint >> 8) - largeInputLE[i*4+2] = byte(codePoint >> 16) - largeInputLE[i*4+3] = byte(codePoint >> 24) - - // Append UTF-8 encoded rune to expected result - buf := make([]byte, 4) - n := utf8.EncodeRune(buf, codePoint) - expected = append(expected, buf[:n]...) - } - - t.Run("Large UTF-32BE input", func(t *testing.T) { - result, err := utf32ToUTF8(largeInputBE, true) - require.NoError(t, err, "utf32ToUTF8() should not return an error for valid large input") - assert.Equal(t, expected, result, "utf32ToUTF8() should correctly convert large UTF-32BE input") - }) - - t.Run("Large UTF-32LE input", func(t *testing.T) { - result, err := utf32ToUTF8(largeInputLE, false) - require.NoError(t, err, "utf32ToUTF8() should not return an error for valid large input") - assert.Equal(t, expected, result, "utf32ToUTF8() should correctly convert large UTF-32LE input") - }) -}