From 1f96d1af84c45f32007da0105b8eab514ec3f7d3 Mon Sep 17 00:00:00 2001 From: Ivan Bella <347158+ivakegg@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:50:29 +0000 Subject: [PATCH 01/42] git subrepo clone git@github.com:NationalSecurityAgency/datawave-type-utils.git core/utils/type-utils subrepo: subdir: "core/utils/type-utils" merged: "55d92d5d99" upstream: origin: "git@github.com:NationalSecurityAgency/datawave-type-utils.git" branch: "main" commit: "55d92d5d99" git-subrepo: version: "0.4.9" origin: "https://github.com/ingydotnet/git-subrepo" commit: "cce3d93" --- .gitmodules | 3 - core/utils/type-utils | 1 - core/utils/type-utils/.github/CODEOWNERS | 3 + .../type-utils/.github/workflows/settings.xml | 23 + .../type-utils/.github/workflows/tests.yaml | 17 + core/utils/type-utils/.gitignore | 9 + core/utils/type-utils/.gitrepo | 12 + core/utils/type-utils/LICENSE | 203 +++ core/utils/type-utils/README.md | 8 + core/utils/type-utils/pom.xml | 262 ++++ .../AbstractGeometryNormalizer.java | 239 ++++ .../data/normalizer/AbstractNormalizer.java | 17 + .../data/normalizer/DateNormalizer.java | 187 +++ .../normalizer/DiscreteIndexNormalizer.java | 54 + .../data/normalizer/GeoLatNormalizer.java | 57 + .../data/normalizer/GeoLonNormalizer.java | 58 + .../data/normalizer/GeoNormalizer.java | 533 +++++++ .../data/normalizer/GeometryNormalizer.java | 88 ++ .../data/normalizer/HexStringNormalizer.java | 74 + .../data/normalizer/IpAddressNormalizer.java | 56 + .../normalizer/LcNoDiacriticsNormalizer.java | 74 + .../data/normalizer/LcNormalizer.java | 50 + .../data/normalizer/MacAddressNormalizer.java | 116 ++ .../data/normalizer/NetworkNormalizer.java | 46 + .../data/normalizer/NoOpNormalizer.java | 27 + .../normalizer/NormalizationException.java | 25 + .../datawave/data/normalizer/Normalizer.java | 42 + .../data/normalizer/NumberNormalizer.java | 66 + .../data/normalizer/OneToManyNormalizer.java | 10 + .../data/normalizer/PointNormalizer.java | 72 + 
.../data/normalizer/RawDateNormalizer.java | 34 + .../TrimLeadingZerosNormalizer.java | 34 + .../data/normalizer/ZeroRegexStatus.java | 5 + .../normalizer/regex/AlternationNode.java | 38 + .../data/normalizer/regex/AnyCharNode.java | 32 + .../data/normalizer/regex/CharClassNode.java | 48 + .../data/normalizer/regex/CharRangeNode.java | 59 + .../normalizer/regex/DigitCharClassNode.java | 34 + .../data/normalizer/regex/EmptyNode.java | 32 + .../normalizer/regex/EncodedNumberNode.java | 37 + .../normalizer/regex/EncodedPatternNode.java | 41 + .../data/normalizer/regex/EndAnchorNode.java | 32 + .../regex/EscapedSingleCharNode.java | 46 + .../data/normalizer/regex/ExpressionNode.java | 41 + .../data/normalizer/regex/GroupNode.java | 41 + .../data/normalizer/regex/IntegerNode.java | 48 + .../normalizer/regex/IntegerRangeNode.java | 67 + .../datawave/data/normalizer/regex/Node.java | 333 +++++ .../normalizer/regex/NodeListIterator.java | 192 +++ .../data/normalizer/regex/NodeType.java | 25 + .../normalizer/regex/NumericRegexEncoder.java | 462 +++++++ .../data/normalizer/regex/OneOrMoreNode.java | 32 + .../normalizer/regex/QuestionMarkNode.java | 32 + .../data/normalizer/regex/RegexConstants.java | 65 + .../data/normalizer/regex/RegexParser.java | 305 ++++ .../data/normalizer/regex/RegexReader.java | 246 ++++ .../data/normalizer/regex/RegexUtils.java | 639 +++++++++ .../data/normalizer/regex/RepetitionNode.java | 36 + .../data/normalizer/regex/SingleCharNode.java | 46 + .../normalizer/regex/StartAnchorNode.java | 34 + .../data/normalizer/regex/ZeroOrMoreNode.java | 32 + .../regex/visitor/AlternationDeduper.java | 57 + .../regex/visitor/AnchorTrimmer.java | 29 + .../normalizer/regex/visitor/BaseVisitor.java | 148 ++ .../normalizer/regex/visitor/BinFinder.java | 210 +++ .../normalizer/regex/visitor/CopyVisitor.java | 161 +++ .../regex/visitor/DecimalPointPlacer.java | 561 ++++++++ .../regex/visitor/DecimalPointValidator.java | 58 + .../regex/visitor/EmptyLeafTrimmer.java | 
79 ++ .../regex/visitor/EqualityVisitor.java | 169 +++ .../regex/visitor/ExponentialBinAdder.java | 154 +++ .../regex/visitor/GTEOneBinFinder.java | 143 ++ .../regex/visitor/LTOneBinFinder.java | 143 ++ .../NegativeNumberPatternInverter.java | 567 ++++++++ .../visitor/NegativeVariantExpander.java | 64 + .../visitor/NonEncodedNumbersChecker.java | 68 + .../visitor/NumericCharClassValidator.java | 61 + .../visitor/OptionalVariantExpander.java | 167 +++ .../regex/visitor/PrintVisitor.java | 223 +++ .../regex/visitor/SimpleNumberEncoder.java | 90 ++ .../regex/visitor/StringVisitor.java | 192 +++ .../regex/visitor/SubExpressionVisitor.java | 98 ++ .../normalizer/regex/visitor/Visitor.java | 65 + .../visitor/ZeroLengthRepetitionTrimmer.java | 97 ++ .../normalizer/regex/visitor/ZeroTrimmer.java | 722 ++++++++++ .../regex/visitor/ZeroValueNormalizer.java | 157 +++ .../datawave/data/parser/GeometryParser.java | 22 + .../java/datawave/data/parser/WKBParser.java | 34 + .../java/datawave/data/parser/WKTParser.java | 39 + .../data/type/AbstractGeometryType.java | 99 ++ .../java/datawave/data/type/BaseType.java | 175 +++ .../java/datawave/data/type/DateType.java | 36 + .../datawave/data/type/DiscreteIndexType.java | 54 + .../java/datawave/data/type/GeoLatType.java | 23 + .../java/datawave/data/type/GeoLonType.java | 23 + .../main/java/datawave/data/type/GeoType.java | 23 + .../java/datawave/data/type/GeometryType.java | 43 + .../datawave/data/type/HexStringType.java | 23 + .../java/datawave/data/type/HitTermType.java | 3 + .../datawave/data/type/IpAddressType.java | 45 + .../datawave/data/type/IpV4AddressType.java | 36 + .../data/type/LcNoDiacriticsListType.java | 15 + .../data/type/LcNoDiacriticsType.java | 27 + .../main/java/datawave/data/type/LcType.java | 27 + .../java/datawave/data/type/ListType.java | 51 + .../datawave/data/type/MacAddressType.java | 23 + .../java/datawave/data/type/NoOpType.java | 29 + .../datawave/data/type/NumberListType.java | 15 + 
.../java/datawave/data/type/NumberType.java | 27 + .../data/type/OneToManyNormalizerType.java | 12 + .../java/datawave/data/type/PointType.java | 14 + .../java/datawave/data/type/RawDateType.java | 28 + .../java/datawave/data/type/StringType.java | 23 + .../data/type/TrimLeadingZerosType.java | 23 + .../main/java/datawave/data/type/Type.java | 56 + .../java/datawave/data/type/TypeFactory.java | 72 + .../data/type/util/AbstractGeometry.java | 25 + .../datawave/data/type/util/Geometry.java | 17 + .../datawave/data/type/util/IpAddress.java | 44 + .../datawave/data/type/util/IpV4Address.java | 373 +++++ .../datawave/data/type/util/IpV6Address.java | 262 ++++ .../datawave/data/type/util/MACAddress.java | 265 ++++ .../data/type/util/NumericalEncoder.java | 206 +++ .../datawave/data/type/util/PhoneNumber.java | 465 +++++++ .../java/datawave/data/type/util/Point.java | 17 + .../query/parser/JavaRegexAnalyzer.java | 1228 +++++++++++++++++ .../webservice/query/data/ObjectSizeOf.java | 220 +++ .../query/util/OptionallyEncodedString.java | 119 ++ .../util/OptionallyEncodedStringAdapter.java | 51 + .../util/QueryUncaughtExceptionHandler.java | 28 + .../webservice/query/util/TypedValue.java | 520 +++++++ .../query/util/TypedValueAdapter.java | 16 + .../webservice/query/util/XMLUtil.java | 27 + .../datawave.data.parser.GeometryParser | 1 + .../webservice/query/util/package-info.java | 7 + .../type-utils/src/main/spotbugs/excludes.xml | 12 + .../data/normalizer/DateNormalizerTest.java | 183 +++ .../normalizer/GeometryNormalizerTest.java | 131 ++ .../normalizer/HexStringNormalizerTest.java | 60 + .../normalizer/IpAddressNormalizerTest.java | 124 ++ .../LcNoDiacriticsNormalizerTest.java | 17 + .../NormalizationExceptionTest.java | 60 + .../data/normalizer/NumberNormalizerTest.java | 264 ++++ .../data/normalizer/PointNormalizerTest.java | 159 +++ .../data/normalizer/regex/NodeAssert.java | 337 +++++ .../regex/NumericRegexEncoderTest.java | 607 ++++++++ 
.../normalizer/regex/RegexParserTest.java | 306 ++++ .../data/normalizer/regex/RegexUtilsTest.java | 191 +++ .../regex/visitor/AlternationDeduperTest.java | 42 + .../regex/visitor/AnchorTrimmerTest.java | 52 + .../regex/visitor/DecimalPointPlacerTest.java | 426 ++++++ .../visitor/DecimalPointValidatorTest.java | 64 + .../regex/visitor/EmptyLeafTrimmerTest.java | 74 + .../visitor/ExponentialBinAdderTest.java | 445 ++++++ .../NegativeNumberPatternInverterTest.java | 125 ++ .../visitor/NegativeVariantExpanderTest.java | 66 + .../visitor/NonEncodedNumbersCheckerTest.java | 48 + .../NumericCharClassValidatorTest.java | 84 ++ .../visitor/OptionalVariantExpanderTest.java | 83 ++ .../visitor/SimpleNumberEncoderTest.java | 78 ++ .../regex/visitor/StringVisitorTest.java | 44 + .../ZeroLengthRepetitionTrimmerTest.java | 64 + .../regex/visitor/ZeroTrimmerTest.java | 362 +++++ .../visitor/ZeroValueNormalizerTest.java | 85 ++ .../datawave/data/parser/WKBParserTest.java | 52 + .../data/type/GeometryObjectSizeTest.java | 110 ++ .../datawave/data/type/IpAddressTypeTest.java | 98 ++ .../data/type/LcNoDiacriticsTypeTest.java | 21 + .../java/datawave/data/type/ListTypeTest.java | 53 + .../datawave/data/type/TypeFactoryTest.java | 116 ++ .../data/type/util/IpV6AddressTypeTest.java | 56 + .../data/type/util/NumericalEncoderTest.java | 87 ++ .../query/parser/JavaRegexAnalyzerTest.java | 1169 ++++++++++++++++ .../query/data/ObjectSizeOfTest.java | 115 ++ .../webservice/query/util/TypedValueTest.java | 216 +++ .../webservice/query/util/XMLUtilTest.java | 26 + .../datawave/data/normalizer/geoRanges.txt | 1 + .../datawave/data/normalizer/pointRanges.txt | 1 + .../query/util/TypedValueExpectedEncoded.xml | 1 + .../util/TypedValueExpectedUnencoded.xml | 1 + .../src/test/resources/log4j.properties | 6 + 181 files changed, 21792 insertions(+), 4 deletions(-) delete mode 160000 core/utils/type-utils create mode 100644 core/utils/type-utils/.github/CODEOWNERS create mode 100644 
core/utils/type-utils/.github/workflows/settings.xml create mode 100644 core/utils/type-utils/.github/workflows/tests.yaml create mode 100644 core/utils/type-utils/.gitignore create mode 100644 core/utils/type-utils/.gitrepo create mode 100644 core/utils/type-utils/LICENSE create mode 100644 core/utils/type-utils/README.md create mode 100644 core/utils/type-utils/pom.xml create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/AbstractGeometryNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/AbstractNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/DateNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/DiscreteIndexNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/GeoLatNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/GeoLonNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/GeoNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/GeometryNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/HexStringNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/IpAddressNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/LcNoDiacriticsNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/LcNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/MacAddressNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/NetworkNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/NoOpNormalizer.java create mode 100644 
core/utils/type-utils/src/main/java/datawave/data/normalizer/NormalizationException.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/Normalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/NumberNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/OneToManyNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/PointNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/RawDateNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/TrimLeadingZerosNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/ZeroRegexStatus.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/AlternationNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/AnyCharNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/CharClassNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/CharRangeNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/DigitCharClassNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EmptyNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EncodedNumberNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EncodedPatternNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EndAnchorNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EscapedSingleCharNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/ExpressionNode.java create mode 
100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/GroupNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/IntegerNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/IntegerRangeNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/Node.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NodeListIterator.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NodeType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NumericRegexEncoder.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/OneOrMoreNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/QuestionMarkNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexConstants.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexParser.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexReader.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexUtils.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RepetitionNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/SingleCharNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/StartAnchorNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/ZeroOrMoreNode.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/AlternationDeduper.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/AnchorTrimmer.java 
create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/BaseVisitor.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/BinFinder.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/CopyVisitor.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/DecimalPointPlacer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/DecimalPointValidator.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/EmptyLeafTrimmer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/EqualityVisitor.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ExponentialBinAdder.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/GTEOneBinFinder.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/LTOneBinFinder.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NegativeNumberPatternInverter.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NegativeVariantExpander.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NonEncodedNumbersChecker.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NumericCharClassValidator.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/OptionalVariantExpander.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/PrintVisitor.java create mode 100644 
core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/SimpleNumberEncoder.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/StringVisitor.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/SubExpressionVisitor.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/Visitor.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroLengthRepetitionTrimmer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroTrimmer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroValueNormalizer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/parser/GeometryParser.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/parser/WKBParser.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/parser/WKTParser.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/AbstractGeometryType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/BaseType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/DateType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/DiscreteIndexType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/GeoLatType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/GeoLonType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/GeoType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/GeometryType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/HexStringType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/HitTermType.java create 
mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/IpAddressType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/IpV4AddressType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/LcNoDiacriticsListType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/LcNoDiacriticsType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/LcType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/ListType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/MacAddressType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/NoOpType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/NumberListType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/NumberType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/OneToManyNormalizerType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/PointType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/RawDateType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/StringType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/TrimLeadingZerosType.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/Type.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/TypeFactory.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/util/AbstractGeometry.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/util/Geometry.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/util/IpAddress.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/util/IpV4Address.java create mode 100644 
core/utils/type-utils/src/main/java/datawave/data/type/util/IpV6Address.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/util/MACAddress.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/util/NumericalEncoder.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/util/PhoneNumber.java create mode 100644 core/utils/type-utils/src/main/java/datawave/data/type/util/Point.java create mode 100644 core/utils/type-utils/src/main/java/datawave/query/parser/JavaRegexAnalyzer.java create mode 100644 core/utils/type-utils/src/main/java/datawave/webservice/query/data/ObjectSizeOf.java create mode 100644 core/utils/type-utils/src/main/java/datawave/webservice/query/util/OptionallyEncodedString.java create mode 100644 core/utils/type-utils/src/main/java/datawave/webservice/query/util/OptionallyEncodedStringAdapter.java create mode 100644 core/utils/type-utils/src/main/java/datawave/webservice/query/util/QueryUncaughtExceptionHandler.java create mode 100644 core/utils/type-utils/src/main/java/datawave/webservice/query/util/TypedValue.java create mode 100644 core/utils/type-utils/src/main/java/datawave/webservice/query/util/TypedValueAdapter.java create mode 100644 core/utils/type-utils/src/main/java/datawave/webservice/query/util/XMLUtil.java create mode 100644 core/utils/type-utils/src/main/resources/META-INF/services/datawave.data.parser.GeometryParser create mode 100644 core/utils/type-utils/src/main/resources/source-templates/datawave/webservice/query/util/package-info.java create mode 100644 core/utils/type-utils/src/main/spotbugs/excludes.xml create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/DateNormalizerTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/GeometryNormalizerTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/HexStringNormalizerTest.java create mode 100644 
core/utils/type-utils/src/test/java/datawave/data/normalizer/IpAddressNormalizerTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/LcNoDiacriticsNormalizerTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/NormalizationExceptionTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/NumberNormalizerTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/PointNormalizerTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/NodeAssert.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/NumericRegexEncoderTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/RegexParserTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/RegexUtilsTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/AlternationDeduperTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/AnchorTrimmerTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/DecimalPointPlacerTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/DecimalPointValidatorTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/EmptyLeafTrimmerTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ExponentialBinAdderTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NegativeNumberPatternInverterTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NegativeVariantExpanderTest.java create mode 100644 
core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NonEncodedNumbersCheckerTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NumericCharClassValidatorTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/OptionalVariantExpanderTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/SimpleNumberEncoderTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/StringVisitorTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ZeroLengthRepetitionTrimmerTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ZeroTrimmerTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ZeroValueNormalizerTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/parser/WKBParserTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/type/GeometryObjectSizeTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/type/IpAddressTypeTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/type/LcNoDiacriticsTypeTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/type/ListTypeTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/type/TypeFactoryTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/type/util/IpV6AddressTypeTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/data/type/util/NumericalEncoderTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/query/parser/JavaRegexAnalyzerTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/webservice/query/data/ObjectSizeOfTest.java create mode 100644 
core/utils/type-utils/src/test/java/datawave/webservice/query/util/TypedValueTest.java create mode 100644 core/utils/type-utils/src/test/java/datawave/webservice/query/util/XMLUtilTest.java create mode 100644 core/utils/type-utils/src/test/resources/datawave/data/normalizer/geoRanges.txt create mode 100644 core/utils/type-utils/src/test/resources/datawave/data/normalizer/pointRanges.txt create mode 100644 core/utils/type-utils/src/test/resources/datawave/webservice/query/util/TypedValueExpectedEncoded.xml create mode 100644 core/utils/type-utils/src/test/resources/datawave/webservice/query/util/TypedValueExpectedUnencoded.xml create mode 100644 core/utils/type-utils/src/test/resources/log4j.properties diff --git a/.gitmodules b/.gitmodules index 26ad0ff918c..60d8ea4a4a8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,9 +13,6 @@ [submodule "core/base-rest-responses"] path = core/base-rest-responses url = git@github.com:NationalSecurityAgency/datawave-base-rest-responses.git -[submodule "core/utils/type-utils"] - path = core/utils/type-utils - url = git@github.com:NationalSecurityAgency/datawave-type-utils.git [submodule "core/utils/metadata-utils"] path = core/utils/metadata-utils url = git@github.com:NationalSecurityAgency/datawave-metadata-utils.git diff --git a/core/utils/type-utils b/core/utils/type-utils deleted file mode 160000 index 55d92d5d99c..00000000000 --- a/core/utils/type-utils +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 55d92d5d99c6e232ba1d7ad12c210ded9ec240a9 diff --git a/core/utils/type-utils/.github/CODEOWNERS b/core/utils/type-utils/.github/CODEOWNERS new file mode 100644 index 00000000000..72450894a4f --- /dev/null +++ b/core/utils/type-utils/.github/CODEOWNERS @@ -0,0 +1,3 @@ +# In order to ensure the query microservices continue to work, changes to this repo +# must be tightly controlled. 
+* @jwomeara @ivakegg @hlgp \ No newline at end of file diff --git a/core/utils/type-utils/.github/workflows/settings.xml b/core/utils/type-utils/.github/workflows/settings.xml new file mode 100644 index 00000000000..d8be2eb498d --- /dev/null +++ b/core/utils/type-utils/.github/workflows/settings.xml @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + github-datawave + ${env.USER_NAME} + ${env.ACCESS_TOKEN} + + + diff --git a/core/utils/type-utils/.github/workflows/tests.yaml b/core/utils/type-utils/.github/workflows/tests.yaml new file mode 100644 index 00000000000..d714b4deee8 --- /dev/null +++ b/core/utils/type-utils/.github/workflows/tests.yaml @@ -0,0 +1,17 @@ +name: Tests + +on: + push: + paths-ignore: ['*.md', 'CODEOWNERS', 'LICENSE'] + branches: + - 'main' + - 'release/*' + pull_request: + paths-ignore: ['*.md', 'CODEOWNERS', 'LICENSE'] + +jobs: + call-reusable-workflow: + uses: nationalsecurityagency/datawave/.github/workflows/microservice-maven-tests.yaml@integration + secrets: + USER_NAME: ${{ secrets.USER_NAME }} + ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }} \ No newline at end of file diff --git a/core/utils/type-utils/.gitignore b/core/utils/type-utils/.gitignore new file mode 100644 index 00000000000..6e170178597 --- /dev/null +++ b/core/utils/type-utils/.gitignore @@ -0,0 +1,9 @@ +target/ + +.idea/ +*.iml +*.iws + +.classpath +.project +.settings/ diff --git a/core/utils/type-utils/.gitrepo b/core/utils/type-utils/.gitrepo new file mode 100644 index 00000000000..d48dd88e1ac --- /dev/null +++ b/core/utils/type-utils/.gitrepo @@ -0,0 +1,12 @@ +; DO NOT EDIT (unless you know what you are doing) +; +; This subdirectory is a git "subrepo", and this file is maintained by the +; git-subrepo command. 
See https://github.com/ingydotnet/git-subrepo#readme +; +[subrepo] + remote = git@github.com:NationalSecurityAgency/datawave-type-utils.git + branch = main + commit = 55d92d5d99c6e232ba1d7ad12c210ded9ec240a9 + parent = 492dcb6b2af08d4ce8e43c8a9a323b40abdfb062 + method = merge + cmdver = 0.4.9 diff --git a/core/utils/type-utils/LICENSE b/core/utils/type-utils/LICENSE new file mode 100644 index 00000000000..6b0b1270ff0 --- /dev/null +++ b/core/utils/type-utils/LICENSE @@ -0,0 +1,203 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/core/utils/type-utils/README.md b/core/utils/type-utils/README.md new file mode 100644 index 00000000000..db9a8992985 --- /dev/null +++ b/core/utils/type-utils/README.md @@ -0,0 +1,8 @@ +# Type Utils + +[![Apache License][li]][ll] ![Build Status](https://github.com/NationalSecurityAgency/datawave-type-utils/workflows/Tests/badge.svg) + +Type Utils are DATAWAVE data type objects and utilities for working with them. 
+ +[li]: http://img.shields.io/badge/license-ASL-blue.svg +[ll]: https://www.apache.org/licenses/LICENSE-2.0 \ No newline at end of file diff --git a/core/utils/type-utils/pom.xml b/core/utils/type-utils/pom.xml new file mode 100644 index 00000000000..51e2254c577 --- /dev/null +++ b/core/utils/type-utils/pom.xml @@ -0,0 +1,262 @@ + + + 4.0.0 + + gov.nsa.datawave.microservice + datawave-microservice-parent + 4.0.0 + ../../../microservices/microservice-parent/pom.xml + + type-utils + 3.1.2-SNAPSHOT + https://code.nsa.gov/datawave-type-utils + + + The Apache License, Version 2.0 + https://www.apache.org/licenses/LICENSE-2.0.txt + + + + scm:git:https://github.com/NationalSecurityAgency/datawave-type-utils.git + scm:git:git@github.com:NationalSecurityAgency/datawave-type-utils.git + HEAD + https://github.com/NationalSecurityAgency/datawave-type-utils + + + http://webservice.datawave.nsa/v1 + ${project.basedir}/src/main/spotbugs/excludes.xml + 3.20.2 + 3.0.0 + 3.12.0 + 3.6 + 1.2.0 + 2.3.3 + 1.19.0 + 1.6.2 + 1.7.29 + + + + + commons-net + commons-net + ${version.commons-net} + + + gov.nsa.datawave.microservice + common-utils + ${version.common-utils} + + + io.protostuff + protostuff-collectionschema + ${version.protostuff} + + + io.protostuff + protostuff-core + ${version.protostuff} + + + org.apache.commons + commons-lang3 + ${version.commons-lang3} + + + org.assertj + assertj-core + ${version.assertj} + + + org.locationtech.geowave + geowave-core-geotime + ${version.geowave} + + + hadoop-client + org.apache.hadoop + + + accumulo-minicluster + org.apache.accumulo + + + commons-lang3 + org.apache.commons + + + * + org.eclipse.emf + + + bcprov-jdk15on + org.bouncycastle + + + json-lib + net.sf.json-lib + + + log4j + log4j + + + + + org.locationtech.jts + jts-core + ${version.jts} + + + org.slf4j + slf4j-api + ${version.slf4j} + + + com.sun.xml.bind + jaxb-impl + ${version.jaxb} + provided + + + + + + com.sun.xml.bind + jaxb-impl + + + commons-net + commons-net + + + 
gov.nsa.datawave.microservice + common-utils + + + io.protostuff + protostuff-collectionschema + + + io.protostuff + protostuff-core + + + org.apache.commons + commons-lang3 + + + org.locationtech.geowave + geowave-core-geotime + + + org.locationtech.jts + jts-core + + + org.slf4j + slf4j-api + + + org.assertj + assertj-core + test + + + org.junit.jupiter + junit-jupiter-engine + test + + + xerces + xercesImpl + 2.12.2 + test + + + + + + true + + + false + + github-datawave + https://maven.pkg.github.com/NationalSecurityAgency/datawave + + + + false + + osgeo-release + Open Source Geospatial Foundation Repository + https://repo.osgeo.org/repository/release/ + + + + false + + cloudera + https://repository.cloudera.com/artifactory/cloudera-repos/ + + + geosolutions + GeoSolutions Repository + https://maven.geo-solutions.it + + + + + + true + src/main/resources + + source-templates/** + + + + + + maven-resources-plugin + + + copy-templated-sources + validate + + copy-resources + + + ${project.build.directory}/generated-sources/templated-sources + + + src/main/resources/source-templates + true + + + + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.3.0 + + + add-source + generate-sources + + add-source + + + + target/generated-sources/templated-sources + + + + + + + + diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/AbstractGeometryNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/AbstractGeometryNormalizer.java new file mode 100644 index 00000000000..665afd7c88e --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/AbstractGeometryNormalizer.java @@ -0,0 +1,239 @@ +package datawave.data.normalizer; + +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.ServiceLoader; +import java.util.TreeSet; + +import org.apache.commons.codec.binary.Hex; +import 
org.locationtech.geowave.core.geotime.util.GeometryUtils; +import org.locationtech.geowave.core.index.NumericIndexStrategy; +import org.locationtech.geowave.core.index.sfc.data.MultiDimensionalNumericData; +import org.locationtech.geowave.core.store.api.Index; +import org.locationtech.jts.geom.Geometry; + +import datawave.data.parser.GeometryParser; + +/** + * A normalizer that, given a parseable geometry string representing an arbitrary geometry, will perform GeoWave indexing with a spatial geowave index + * configuration + * + */ +public abstract class AbstractGeometryNormalizer + implements Normalizer, DiscreteIndexNormalizer { + private static final long serialVersionUID = 171360806347433135L; + + protected static final int LONGITUDE_BITS = 31; + protected static final int LATITUDE_BITS = 31; + + private static TreeSet geoParsers = new TreeSet<>(); + + static { + ServiceLoader geoParserLoader = ServiceLoader.load(GeometryParser.class, GeometryNormalizer.class.getClassLoader()); + for (GeometryParser geoParser : geoParserLoader) + geoParsers.add(geoParser); + } + + // NOTE: If we change the index strategy, then we will need to update the validHash method appropriately. + abstract public NumericIndexStrategy getIndexStrategy(); + + abstract public Index getIndex(); + + abstract protected T createDatawaveGeometry(G geometry); + + /** + * Expects to receive a parseable geometry string. 
The default geometry parser accepts Open Geospatial Consortium compliant Well-Known Text (WKT) strings. An + * example for points is of the form: + * + * POINT ([number][space][number]) + */ + @Override + public String normalize(String geoString) throws IllegalArgumentException { + if (validHash(geoString)) { + return geoString; + } + return normalizeDelegateType(createDatawaveGeometry((G) parseGeometry(geoString))); + } + + @Override + public T denormalize(String geoString) { + // this is assuming the input string is not actually normalized + // (which oddly is the case with other normalizers) + return createDatawaveGeometry((G) parseGeometry(geoString)); + } + + /** + * We cannot support regex against geometry fields + */ + @Override + public String normalizeRegex(String fieldRegex) throws IllegalArgumentException { + throw new IllegalArgumentException("Cannot normalize a regex against a geometry field"); + } + + @Override + public boolean normalizedRegexIsLossy(String in) { + throw new IllegalArgumentException("Cannot normalize a regex against a geometry field"); + } + + public String normalizeDelegateType(T geometry) { + return getEncodedStringFromIndexBytes(getSingleIndexFromGeometry(geometry)); + } + + public static String getEncodedStringFromIndexBytes(byte[] index) { + return Hex.encodeHexString(index); + } + + public static Geometry parseGeometry(String geoString) throws IllegalArgumentException { + for (GeometryParser geoParser : geoParsers) { + Geometry geom = geoParser.parseGeometry(geoString); + if (geom != null) + return geom; + } + throw new IllegalArgumentException("Cannot parse geometry from string [" + geoString + "]"); + } + + private byte[] getSingleIndexFromGeometry(T geometry) { + NumericIndexStrategy indexStrategy = getIndexStrategy(); + final List insertionIds = new ArrayList<>(); + for (MultiDimensionalNumericData range : GeometryUtils.basicConstraintsFromGeometry(geometry.getJTSGeometry()).getIndexConstraints(getIndex())) { +
insertionIds.addAll(getIndexStrategy().getInsertionIds(range, 1).getCompositeInsertionIds()); + } + if (insertionIds.size() == 1) { + return insertionIds.get(0); + } + // this should never occur + throw new IllegalArgumentException("Cannot normalize input geometry, no resulting indices"); + } + + protected List getIndicesFromGeometry(T geometry) { + NumericIndexStrategy indexStrategy = getIndexStrategy(); + final List insertionIds = new ArrayList<>(); + for (MultiDimensionalNumericData range : GeometryUtils.basicConstraintsFromGeometry(geometry.getJTSGeometry()).getIndexConstraints(getIndex())) { + insertionIds.addAll(getIndexStrategy().getInsertionIds(range).getCompositeInsertionIds()); + } + return insertionIds; + } + + @Override + public Collection expand(String geoString) { + List indices = getIndicesFromGeometry(createDatawaveGeometry((G) parseGeometry(geoString))); + List retVal = new ArrayList<>(indices.size()); + for (byte[] index : indices) { + retVal.add(getEncodedStringFromIndexBytes(index)); + } + return retVal; + } + + @Override + public String incrementIndex(String index) { + String nextIndex = adjustHexRange(index, true); + return (nextIndex.length() != index.length()) ? index : nextIndex; + } + + @Override + public String decrementIndex(String index) { + String prevIndex = adjustHexRange(index, false); + return (prevIndex.length() != index.length()) ? 
index : prevIndex; + } + + @Override + public List discretizeRange(String beginIndex, String endIndex) { + List discreteIndices = new ArrayList<>(); + if (beginIndex.compareTo(endIndex) <= 0) { + if (beginIndex.length() == endIndex.length()) { + for (String nextIndex = beginIndex; nextIndex.compareTo(endIndex) <= 0; nextIndex = incrementIndex(nextIndex)) + discreteIndices.add(nextIndex); + } else { + discreteIndices.add(beginIndex); + discreteIndices.add(endIndex); + } + } + return discreteIndices; + } + + @Override + public boolean producesFixedLengthRanges() { + return true; + } + + private String adjustHexRange(String hexValue, boolean increment) { + int length = hexValue.length(); + String format = "%0" + hexValue.length() + "x"; + if (length < 8) { + return adjustHexRangeInteger(hexValue, format, increment); + } else if (length < 16) { + return adjustHexRangeLong(hexValue, format, increment); + } else { + return adjustHexRangeBigInteger(hexValue, format, increment); + } + } + + private String adjustHexRangeInteger(String hexValue, String format, boolean increment) { + return String.format(format, Integer.parseInt(hexValue, 16) + ((increment) ? 1 : -1)); + } + + private String adjustHexRangeLong(String hexValue, String format, boolean increment) { + return String.format(format, Long.parseLong(hexValue, 16) + ((increment) ? 1L : -1L)); + } + + private String adjustHexRangeBigInteger(String hexValue, String format, boolean increment) { + if (increment) + return String.format(format, new BigInteger(hexValue, 16).add(BigInteger.ONE)); + else + return String.format(format, new BigInteger(hexValue, 16).subtract(BigInteger.ONE)); + } + + /** + * This is used to determine if we have a valid geo hash (tier + position). NOTE: If we change the index strategy, then we will need to update this method + * appropriately. 
+ * + * @param value the hex string to check: a two-hex-digit tier followed by the hex-encoded position + * @return true if valid + */ + public boolean validHash(String value) { + try { + short tier = getTier(value); + if (validTier(tier) && validLength(tier, value)) { + return validPosition(tier, getPosition(value)); + } + } catch (NumberFormatException e) { + // not a valid hex string in the first place + } + return false; + } + + public short getTier(String value) { + return Short.parseShort(value.substring(0, 2), 16); + } + + public long getPosition(String value) { + if (value.length() == 2) { + return 0; + } + return Long.parseLong(value.substring(2), 16); + } + + public boolean validTier(short tier) { + return tier >= 0 && tier <= 0x1f; + } + + public boolean validLength(short tier, String value) { + // determine the length of the position in hex characters + // ceil(tier/4) will get the number of bytes + int bytes = (tier >> 2) + ((tier & 0x3) == 0 ? 0 : 1); + + // multiply by 2 to get the number of hex digits + int posLen = 2 * bytes; + // length is the tier length plus the position length + return value.length() == (2 + posLen); + } + + public boolean validPosition(short tier, long value) { + // The maximum value must be less than pow(2, tier*2) + long max = 1L << (tier * 2); + return value >= 0 && value < max; + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/AbstractNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/AbstractNormalizer.java new file mode 100644 index 00000000000..fcfb658f901 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/AbstractNormalizer.java @@ -0,0 +1,17 @@ +package datawave.data.normalizer; + +import java.util.Collection; +import java.util.Collections; + +public abstract class AbstractNormalizer implements Normalizer { + + @Override + public Collection expand(String in) { + return Collections.singletonList(normalize(in)); + } + + @Override + public boolean normalizedRegexIsLossy(String in) { + return false; + } +} diff
--git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/DateNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/DateNormalizer.java new file mode 100644 index 00000000000..f0498665bea --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/DateNormalizer.java @@ -0,0 +1,187 @@ +package datawave.data.normalizer; + +import java.text.DateFormat; +import java.text.ParseException; +import java.text.ParsePosition; +import java.text.SimpleDateFormat; +import java.util.Collection; +import java.util.Collections; +import java.util.Date; +import java.util.List; +import java.util.Map; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; + +public class DateNormalizer extends AbstractNormalizer { + + private static final long serialVersionUID = -3268331784114135470L; + private static final Logger log = LoggerFactory.getLogger(DateNormalizer.class); + public static final String ISO_8601_FORMAT_STRING = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"; + + public static final String[] FORMAT_STRINGS = {"EEE MMM dd HH:mm:ss zzz yyyy", // at the top just because + "EEE MMM dd HH:mm:ss XXX yyyy", // for ISO 8601 + ISO_8601_FORMAT_STRING, "yyyyMMddHHmmss", "yyyy-MM-dd HH:mm:ssz", "yyyy-MM-dd HH:mm:ss'Z'", "yyyy-MM-dd HH:mm:ss", "yyyy-MM-dd", + "yyyy-MM-dd'T'HH'|'mm", "yyyy-MM-dd'T'HH:mm:ss'Z'", "yyyy-MM-dd't'HH:mm:ss'z'", "yyyy-MM-dd'T'HH:mm:ssXXX", "yyyy-MM-dd'T'HH:mm:ss.SSSSSSS"}; + + private static final ThreadLocal> formatList = new ThreadLocal>() { + protected Map initialValue() { + return Maps.newHashMap(); + } + }; + + public String normalize(String fieldValue) { + Date fieldDate = parseToDate(fieldValue); + return parseToString(fieldDate); + } + + public static SimpleDateFormat getParser(String pattern) { + SimpleDateFormat parser = formatList.get().get(pattern); + if (parser == null) { + parser = new SimpleDateFormat(); + 
parser.setLenient(true); + parser.applyPattern(pattern); + formatList.get().put(pattern, parser); + } + return parser; + } + + public static String convertMicroseconds(String str, String pattern) { + // check for a special case where the incoming string is specifying microseconds instead of milliseconds + if (pattern.lastIndexOf('S') >= 0) { + // presuming the milliseconds is the last number in the string + int endMs = str.length(); + int startMs = -1; + for (int i = endMs - 1; i >= 0; i--) { + char c = str.charAt(i); + if (c >= '0' && c <= '9') { + startMs = i; + } else if (startMs == -1) { + endMs = i; + } else { + break; + } + } + // drop any characters after 3 digits + if (endMs - startMs > 3) { + str = str.substring(0, startMs + 3) + str.substring(endMs); + } + } + return str; + } + + public static Date parseDate(String str, String pattern) { + SimpleDateFormat parser = getParser(pattern); + ParsePosition pos = new ParsePosition(0); + str = convertMicroseconds(str, pattern); + Date date = parser.parse(str, pos); + if (date != null && pos.getIndex() == str.length()) { + return date; + } + return null; + } + + public static Date parseDate(String str, String[] parsePatterns) throws ParseException { + if (str != null && parsePatterns != null) { + for (int i = 0; i < parsePatterns.length; i++) { + Date date = parseDate(str, parsePatterns[i]); + if (date != null) { + return date; + } + } + + throw new ParseException("Unable to parse the date: " + str, -1); + } else { + throw new IllegalArgumentException("Date string nor patterns can be null"); + } + } + + private Date parseToDate(String fieldValue) { + try { + Date date = parseDate(fieldValue, FORMAT_STRINGS); + if (sanityCheck(date.getTime())) { + return date; + } + } catch (ParseException e) { + if (log.isTraceEnabled()) { + log.trace("Failed to normalize value using DateUtils: " + fieldValue); + } + } + + // see if fieldValue looks like a Long value + try { + boolean valid = true; + int size = 
fieldValue.length(); + long dateLong = 0; + for (int i = 0; i < size; i++) { + char c = fieldValue.charAt(i); + if (c >= '0' && c <= '9') { + dateLong *= 10; + dateLong += (c - '0'); + } else { + valid = false; + break; + } + } + if (valid && sanityCheck(dateLong)) { + return new Date(dateLong); + } + } catch (NumberFormatException e) { + // well, it's not a long + } + + throw new IllegalArgumentException("Failed to normalize value as a Date: " + fieldValue); + + } + + private boolean sanityCheck(Long dateLong) { + // between 1900/01/01 and 2100/12/31 + return -2208970800000L <= dateLong && dateLong < 4133894400000L; + } + + private Collection formatAll(Date date) { + List list = Lists.newArrayList(); + for (String format : FORMAT_STRINGS) { + DateFormat fs = getParser(format); + String formatted = fs.format(date); + if (formatted != null && !formatted.isEmpty()) { + list.add(formatted); + } + } + return list; + } + + public String parseToString(Date date) { + return getParser(ISO_8601_FORMAT_STRING).format(date); + } + + /** + * We cannot support regex against dates + */ + public String normalizeRegex(String fieldRegex) { + return fieldRegex; + } + + @Override + public String normalizeDelegateType(Date delegateIn) { + return parseToString(delegateIn); + } + + @Override + public Date denormalize(String in) { + return parseToDate(in); + } + + @Override + public Collection expand(String dateString) { + Date date = parseToDate(dateString); + if (date != null && this.sanityCheck(date.getTime())) { + return formatAll(date); + } + return Collections.emptyList(); + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/DiscreteIndexNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/DiscreteIndexNormalizer.java new file mode 100644 index 00000000000..263b2fee08b --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/DiscreteIndexNormalizer.java @@ -0,0 +1,54 @@ +package 
datawave.data.normalizer; + +import java.util.List; + +/** + * Contains a collection of useful methods which can be used against index entries which are discrete and calculable. + * + * @param + */ +public interface DiscreteIndexNormalizer extends Normalizer { + + /** + * Increments the given index to the next logical value. + * + * If producesFixedLengthRanges is true, and incrementIndex would cause the length of the index to change, the original index will be returned. + * + * @param index + * @return an incremented index + */ + String incrementIndex(String index); + + /** + * Decrements the given index to the previous logical value. + * + * If producesFixedLengthRanges is true, and decrementIndex would cause the length of the index to change, the original index will be returned. + * + * @param index + * @return a decremented index + */ + String decrementIndex(String index); + + /** + * Returns a list of all discrete values between begin and end. + * + * If producesFixedLengthRanges is true, the returned values will be of the same length as begin and end. + * + * If producesFixedLengthRanges is true, and begin and end are of different lengths, the original range will be returned. + * + * If begin does not come before end, an empty list will be returned. + * + * @param beginIndex + * @param endIndex + * @return a list of the discrete index values between begin and end + */ + List discretizeRange(String beginIndex, String endIndex); + + /** + * Indicates whether or not the ranges against the given indices will be of fixed length. That is to say, whether or not all index values within a given + * range will have the same string length. This is an important characteristic which enables composite ranges to be created. 
+ * + * @return whether query ranges against these values will be of fixed length + */ + boolean producesFixedLengthRanges(); +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/GeoLatNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/GeoLatNormalizer.java new file mode 100644 index 00000000000..9a4d6b4c80f --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/GeoLatNormalizer.java @@ -0,0 +1,57 @@ +package datawave.data.normalizer; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import datawave.data.normalizer.GeoNormalizer.ParseException; +import datawave.data.type.util.NumericalEncoder; + +public class GeoLatNormalizer extends AbstractNormalizer { + + private static final long serialVersionUID = -1838190858989807274L; + private static final Logger log = LoggerFactory.getLogger(GeoLatNormalizer.class); + + public String normalize(String fieldValue) { + double val; + try { + val = GeoNormalizer.parseLatOrLon(fieldValue); + } catch (ParseException e) { + throw new IllegalArgumentException(e); + } + if (val < -90.0 || val > 90.0) { + throw new IllegalArgumentException("Latitude is outside of valid range [-90, 90]: " + val); + } + try { + return NumericalEncoder.encode(Double.toString(val)); + } catch (Exception e) { + throw new IllegalArgumentException("Failed to normalize value as a GeoLat: " + fieldValue); + } + } + + /** + * We cannot support regex against numbers + */ + public String normalizeRegex(String fieldRegex) { + throw new IllegalArgumentException("Cannot normalize a regex against a numeric field"); + } + + @Override + public String normalizeDelegateType(String delegateIn) { + return normalize(delegateIn); + } + + @Override + public String denormalize(String in) { + if (NumericalEncoder.isPossiblyEncoded(in)) { + try { + return NumericalEncoder.decode(in).toString(); + } catch (NumberFormatException e) { + if (log.isTraceEnabled()) { + log.trace("Error 
decoding value.", e); + } + } + } + return in; + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/GeoLonNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/GeoLonNormalizer.java new file mode 100644 index 00000000000..c77a662d467 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/GeoLonNormalizer.java @@ -0,0 +1,58 @@ +package datawave.data.normalizer; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import datawave.data.normalizer.GeoNormalizer.ParseException; +import datawave.data.type.util.NumericalEncoder; + +public class GeoLonNormalizer extends AbstractNormalizer { + + private static final long serialVersionUID = 2026515023484372154L; + private static final Logger log = LoggerFactory.getLogger(GeoLonNormalizer.class); + + public String normalize(String fieldValue) { + double val; + try { + val = GeoNormalizer.parseLatOrLon(fieldValue); + } catch (ParseException e) { + throw new IllegalArgumentException(e); + } + if (val < -180.0 || val > 180.0) { + throw new IllegalArgumentException("Longitude is outside of valid range [-180, 180]: " + val); + } + try { + return NumericalEncoder.encode(Double.toString(val)); + } catch (Exception e) { + throw new IllegalArgumentException("Failed to normalize value as a GeoLon: " + fieldValue); + } + } + + /** + * We cannot support regex against numbers + */ + + public String normalizeRegex(String fieldRegex) { + throw new IllegalArgumentException("Cannot normalize a regex against a numeric field"); + } + + @Override + public String normalizeDelegateType(String delegateIn) { + return normalize(delegateIn); + } + + @Override + public String denormalize(String in) { + if (NumericalEncoder.isPossiblyEncoded(in)) { + try { + return NumericalEncoder.decode(in).toString(); + } catch (NumberFormatException e) { + if (log.isTraceEnabled()) { + log.trace("Error decoding value.", e); + } + } + } + return in; + } + +} diff --git 
a/core/utils/type-utils/src/main/java/datawave/data/normalizer/GeoNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/GeoNormalizer.java new file mode 100644 index 00000000000..6506b22a480 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/GeoNormalizer.java @@ -0,0 +1,533 @@ +package datawave.data.normalizer; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.text.NumberFormat; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.ExecutionException; +import java.util.regex.Pattern; + +import org.apache.hadoop.io.Text; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Splitter; +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; +import com.google.common.collect.Iterables; + +import datawave.data.type.util.NumericalEncoder; + +/** + * A normalizer that, given a string of the format, [number][non-number character][number], will split the string at the non-numeric and interlace the left and + * right hand operands. Operands are normalized to contain + * + */ +public class GeoNormalizer extends AbstractNormalizer { + private static final long serialVersionUID = -1212607537051869786L; + + private static final Logger log = LoggerFactory.getLogger(GeoNormalizer.class); + + /* + * The z-order value that this normalize produces has no regard for precision. 
The prefix is always six digits; the suffix is the interleaved fraction digits of both coordinates (the pattern below only requires one or more digits) + */ + private static final Pattern normalizedZRef = Pattern.compile("\\d{6}\\.\\.\\d+"); + + private static final LoadingCache<String,Boolean> isNormalizedCache = CacheBuilder.newBuilder().concurrencyLevel(32).maximumSize(10 * 1024) + .build(new CacheLoader<String,Boolean>() { + + @Override + public Boolean load(String key) { + boolean m = normalizedZRef.matcher(key).matches(); + if (log.isTraceEnabled()) + log.trace(key + " is " + (m ? "" : "not") + " a z-ref."); + return m; + } + + }); + + public static boolean isNormalized(String s) { // cached check of s against the z-ref pattern + try { + return isNormalizedCache.get(s); + } catch (ExecutionException e) { + log.trace("Unable to evaluate z-ref pattern for " + s, e); // fall through: treat the value as not normalized + } + return false; + } + + public static final String separator = "|"; + + /** + * Expects to receive a concatenated string. The string should be of the form: + * + * [number][non-numeric nor decimal dot][number] + * Values that already match the z-ref pattern are returned unchanged. + * @throws IllegalArgumentException + * if no split point is found or the numbers on either side of the delimiter cannot be combined + */ + @Override + public String normalize(String fieldValue) throws IllegalArgumentException { + String normalized = fieldValue; + if (!isNormalized(fieldValue)) { + int split = findSplit(fieldValue); + if (split > 0) { + try { + normalized = combineLatLon(fieldValue.substring(0, split), fieldValue.substring(split + 1)); + } catch (Exception e) { // keep the parse / range failure as the cause + throw new IllegalArgumentException("Failed to normalize value as a Geo: " + fieldValue, e); + } + } else { + throw new IllegalArgumentException("Failed to normalize value as a Geo: " + fieldValue); + } + } + return normalized; + } + + /** + * Expects to receive a concatenated string.
The string should be of the form: + * + * [number][non-numeric nor decimal dot][number] + * + * @throws IllegalArgumentException + * if no split point is found or the numbers on either side of the delimiter cannot be parsed + */ + public double[] parseLatLon(String fieldValue) throws IllegalArgumentException { + int split = findSplit(fieldValue); + if (split > 0) { + try { + return new double[] {parseLatOrLon(fieldValue.substring(0, split)), parseLatOrLon(fieldValue.substring(split + 1))}; + } catch (Exception e) { // keep the parse failure as the cause + throw new IllegalArgumentException("Failed to normalize value as a Geo: " + fieldValue, e); + } + } else { + throw new IllegalArgumentException("Failed to normalize value as a Geo: " + fieldValue); + } + } + + /** + * We cannot support regex against numbers, so this always throws {@link IllegalArgumentException}. + */ + @Override + public String normalizeRegex(String fieldRegex) throws IllegalArgumentException { + throw new IllegalArgumentException("Cannot normalize a regex against a numeric field"); + } + + @Override + public String normalizeDelegateType(String delegateIn) { + return normalize(delegateIn); + } + + @Override + public String denormalize(String in) { + return in; + } + + public String combineLatLon(String lat, String lon) throws OutOfRangeException, ParseException { + return combineLatLon(parseLatOrLon(lat), parseLatOrLon(lon)); + } + + private static final List<Character> DMS_DESIGNATORS = Arrays.asList('n', 's', 'e', 'w'); + + public static double parseLatOrLon(String value) throws ParseException { + value = value.trim(); + + // If we were given a zero-length value, catch this ahead of time so we + // can throw a ParseException instead of an IndexOutOfBoundsException + if (value.isEmpty()) { + throw new ParseException("Could not normalize empty value as latitude or longitude"); + } + + // value may have been encoded + try { + if (NumericalEncoder.isPossiblyEncoded(value)) { + value = NumericalEncoder.decode(value).toPlainString(); + } + } catch (Exception nfe) { + // ok, assume not normalized + } + + char end
= Character.toLowerCase(value.charAt(value.length() - 1)); + if (DMS_DESIGNATORS.contains(end)) { + try { + return convertDMStoDD(value); + } catch (NormalizationException ne) { + throw new ParseException("Unable to convert DMS to DD format", ne); + } + } else { + try { + return parseDouble(value); + } catch (Exception nfe) { + throw new ParseException("Unable to parse lat or lon " + value, nfe); + } + } + } + + /** + * Convert a Degrees / Minutes / Seconds latitude or longitude into decimal degrees. + * + * @param val + * @return + */ + public static double convertDMStoDD(String val) throws NormalizationException { + try { + boolean negate = false; + double degrees = 0.0d; + double minutes = 0.0d; + double seconds = 0.0d; + + val = val.trim(); + char end = Character.toLowerCase(val.charAt(val.length() - 1)); + if (end == 'n' || end == 'e') { + val = val.substring(0, val.length() - 1).trim(); + } else if (end == 's' || end == 'w') { + val = val.substring(0, val.length() - 1).trim(); + negate = true; + } + + // see if it is already split up + if (val.indexOf(':') >= 0) { + String[] parts = Iterables.toArray(Splitter.on(':').split(val), String.class); + degrees = Double.parseDouble(parts[0]); + if (parts.length > 1) { + minutes = Double.parseDouble(parts[1]); + if (parts.length > 2) { + seconds = Double.parseDouble(parts[2]); + if (parts.length > 3) { + throw new NormalizationException("Do not know how to convert lat or lon value: " + val); + } + } + } + } else { + int point = val.indexOf('.'); + if (point < 0) + point = val.length(); + // if more than 3 digits, then we have minutes + if (point > 3) { + // if more than 5 digits, then we have seconds + if (point > 5) { + seconds = Double.parseDouble(val.substring(point - 2)); + minutes = Double.parseDouble(val.substring(point - 4, point - 2)); + degrees = Double.parseDouble(val.substring(0, point - 4)); + } else { + minutes = Double.parseDouble(val.substring(point - 2)); + degrees = Double.parseDouble(val.substring(0, 
point - 2)); + } + } else { + degrees = Double.parseDouble(val); + } + } + + double dd = degrees + (minutes / 60.0d) + (seconds / 3600.0d); + if (negate) { + dd = (0.0d - dd); + } + + return dd; + } catch (Exception nfe) { + throw new NormalizationException("Failed to convert numeric value part of a lat or lon " + val, nfe); + } + } + + public static double parseDouble(String val) throws ParseException { // plain double first, then the numerically encoded form + double value = 0.0d; + try { + value = Double.parseDouble(val); + } catch (Exception e) { + if (NumericalEncoder.isPossiblyEncoded(val)) { + try { + value = NumericalEncoder.decode(val).doubleValue(); + } catch (Exception e2) { + // Don't log, since it's expected that we'll sometimes use this normalizer and pass bad values + // when we need to run an unknown type of term through all normalizers. + throw new ParseException("Failed to convert " + val + " into a double value", e2); + } + } else { + throw new ParseException("Unknown double format: " + val, e); + } + } + return value; + } + + public String combineLatLon(double lat, double lon) throws OutOfRangeException { + return GeoPoint.getZRefStr(new GeoPoint(lat, lon)); + } + + /** + * Returns the index of the separator ("|") when it occurs after position 0; otherwise searches outward from the middle of the string for the first character that cannot be part of a lat or lon value. + * + * @param s the combined lat/lon string + * @return the split index, or -1 if no split character is found + */ + public int findSplit(String s) { + if (separator != null) { + int i = s.indexOf(separator); + if (i > 0) { + return i; + } + } + // search from the center for a non lat or lon character + for (int i = 0; i < s.length(); ++i) { + int side = (i % 2 == 0 ? -1 : 1); + int dist = (i + 1) / 2; + int index = (s.length() / 2) + (dist * side); + if (index >= s.length()) + break; + char c = s.charAt(index); + if ((c > '9' || c < '0') && (c != '.'
&& c != '-' && c != '+') && (c != 'n' && c != 'N' && c != 's' && c != 'S') + && (c != 'e' && c != 'E' && c != 'w' && c != 'W')) { + return index; + } + } + return -1; + } + + public static class GeoPoint { + private double latitude, longitude; + + /** + * Creates a GeoPoint from decimal-degree coordinates; the ranges are checked by {@link #validate()}. + * + * @param latitude latitude in degrees, expected within [-90, 90] + * @param longitude longitude in degrees, expected within [-180, 180] + */ + public GeoPoint(double latitude, double longitude) throws OutOfRangeException { + this.latitude = latitude; + this.longitude = longitude; + validate(); + } + + /** + * Creates a GeoPoint from string coordinates parsed with {@link GeoNormalizer#parseDouble(String)}; the ranges are checked by {@link #validate()}. + * + * @param latitude latitude in degrees, as text + * @param longitude longitude in degrees, as text + * @throws ParseException if either value cannot be parsed as a double + */ + public GeoPoint(String latitude, String longitude) throws OutOfRangeException, ParseException { + this.latitude = GeoNormalizer.parseDouble(latitude); + this.longitude = GeoNormalizer.parseDouble(longitude); + validate(); + } + + /** + * A validation routine that checks the latitude and longitude ranges + * + * @throws OutOfRangeException + * if the latitude is outside [-90, 90] or the longitude is outside [-180, 180] + */ + public void validate() throws OutOfRangeException { + if (this.latitude < -90.0 || this.latitude > 90.0) { + throw new OutOfRangeException("Latitude is outside of valid range [-90, 90]: " + this.latitude + ", " + this.longitude); + } + if (this.longitude < -180.0 || this.longitude > 180.0) { + throw new OutOfRangeException("Longitude is outside of valid range [-180, 180]: " + this.latitude + ", " + this.longitude); + } + } + + /** + * Returns an interlaced representation of the latitude and longitude. The latitude's normal range of -90:90 is shifted to 0:180 (+90) and the + * longitude's normal range of -180:180 has been shifted to 0:360. + *

+ * For example: + *

+ * {@code [45, -150] => [135, 30] => 103350..0000000000000000} + * + * @return + */ + public static Text getZRef(GeoPoint p) { + double latShift = p.latitude + 90.0; + double lonShift = p.longitude + 180.0; + + NumberFormat formatter = NumberFormat.getInstance(); + formatter.setMaximumIntegerDigits(3); + formatter.setMinimumIntegerDigits(3); + formatter.setMaximumFractionDigits(5); + formatter.setMinimumFractionDigits(5); + + String latS = formatter.format(latShift); + String lonS = formatter.format(lonShift); + + byte[] buf = new byte[latS.length() * 2]; + for (int i = 0; i < latS.length(); ++i) { + buf[2 * i] = (byte) latS.charAt(i); + buf[2 * i + 1] = (byte) lonS.charAt(i); + } + + return new Text(buf); + } + + /** + * Returns an interlaced representation of the latitude and longitude. The latitude's normal range of -90:90 is shifted to 0:180 (+90) and the + * logitude's normal range of -180:180 has been shifted to 0:360. + *

+ * For example: + *

+ * {@code [45, -150] => [135, 30] => 103350..0000000000000000} + * + * @return + */ + public static String getZRefStr(GeoPoint p) { + double latShift = p.latitude + 90.0; + double lonShift = p.longitude + 180.0; + + NumberFormat formatter = NumberFormat.getInstance(); + formatter.setMaximumIntegerDigits(3); + formatter.setMinimumIntegerDigits(3); + formatter.setMaximumFractionDigits(5); + formatter.setMinimumFractionDigits(5); + + String latS = formatter.format(latShift); + String lonS = formatter.format(lonShift); + StringBuilder sb = new StringBuilder(latS.length() * 2); + + for (int i = 0; i < latS.length(); ++i) { + sb.append(latS.charAt(i)); + sb.append(lonS.charAt(i)); + } + + return sb.toString(); + } + + /** + * Factory method for decoding a zReference from a Text object. + * + * @param zref + * @return + */ + public static GeoPoint decodeZRef(Text zref) throws OutOfRangeException, ParseException { + StringBuilder latB = new StringBuilder(); + StringBuilder lonB = new StringBuilder(); + + ByteBuffer data = ByteBuffer.wrap(zref.getBytes(), 0, zref.getLength()); + boolean isLat = true; + while (data.hasRemaining()) { + if (isLat) { + latB.append((char) data.get()); + } else { + lonB.append((char) data.get()); + } + isLat = !isLat; + } + + double lat = GeoNormalizer.parseDouble(latB.toString()); + double lon = GeoNormalizer.parseDouble(lonB.toString()); + + return new GeoPoint(lat - 90.0, lon - 180.0); + } + + /** + * Factory method for decoding a zReference from a Text object. 
+ * + * @param zref + * @return + * @throws ParseException + */ + public static GeoPoint decodeZRef(String zref) throws OutOfRangeException, ParseException { + StringBuilder latB = new StringBuilder(); + StringBuilder lonB = new StringBuilder(); + + CharBuffer data = CharBuffer.wrap(zref); + boolean isLat = true; + while (data.hasRemaining()) { + if (isLat) { + latB.append(data.get()); + } else { + lonB.append(data.get()); + } + isLat = !isLat; + } + + double lat = GeoNormalizer.parseDouble(latB.toString()); + double lon = GeoNormalizer.parseDouble(lonB.toString()); + + return new GeoPoint(lat - 90.0, lon - 180.0); + } + + /** + * Given a bounding box described by the lower left corner (boundMind) and the upper right corner (boundMax), this method tests whether or not the point + * is within that box. + * + * @param boundMin + * @param boundMax + * @return + */ + public boolean within(GeoPoint boundMin, GeoPoint boundMax) { + return getLatitude() >= boundMin.getLatitude() && getLatitude() <= boundMax.getLatitude() && getLongitude() >= boundMin.getLongitude() + && getLongitude() <= boundMax.getLongitude(); + } + + public double getLatitude() { + return latitude; + } + + public double getLongitude() { + return longitude; + } + + @Override + public String toString() { + return "(" + getLongitude() + ", " + getLatitude() + ")"; + } + + @Override + public boolean equals(Object o) { + if (o == null) { + return false; + } else if (o instanceof GeoPoint) { + GeoPoint ogp = (GeoPoint) o; + return latitude == ogp.latitude && longitude == ogp.longitude; + } else { + return super.equals(o); + } + } + + @Override + public int hashCode() { + return Objects.hash(getLatitude(), getLongitude()); + } + } + + public static class OutOfRangeException extends Exception { + + public OutOfRangeException() { + super(); + } + + public OutOfRangeException(String message, Throwable cause) { + super(message, cause); + } + + public OutOfRangeException(String message) { + super(message); + } + + 
public OutOfRangeException(Throwable cause) { + super(cause); + } + + } + + public static class ParseException extends Exception { + + public ParseException() { + super(); + } + + public ParseException(String message, Throwable cause) { + super(message, cause); + } + + public ParseException(String message) { + super(message); + } + + public ParseException(Throwable cause) { + super(cause); + } + + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/GeometryNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/GeometryNormalizer.java new file mode 100644 index 00000000000..1200261de97 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/GeometryNormalizer.java @@ -0,0 +1,88 @@ +package datawave.data.normalizer; + +import java.util.List; + +import org.locationtech.geowave.core.geotime.index.dimension.LatitudeDefinition; +import org.locationtech.geowave.core.geotime.index.dimension.LongitudeDefinition; +import org.locationtech.geowave.core.index.NumericIndexStrategy; +import org.locationtech.geowave.core.index.dimension.NumericDimensionDefinition; +import org.locationtech.geowave.core.index.sfc.SFCFactory; +import org.locationtech.geowave.core.index.sfc.tiered.TieredSFCIndexFactory; +import org.locationtech.geowave.core.store.api.Index; +import org.locationtech.geowave.core.store.index.CustomNameIndex; + +import com.google.common.collect.Lists; + +import datawave.data.type.util.Geometry; + +/** + * A normalizer that, given a parseable geometry string representing an arbitrary geometry, will perform GeoWave indexing with a multi-tiered spatial geowave + * index configuration + */ +public class GeometryNormalizer extends AbstractGeometryNormalizer implements OneToManyNormalizer { + private static final long serialVersionUID = 171360806347433135L; + + // NOTE: If we change the index strategy, then we will need to update the validHash method appropriately. 
+ // @formatter:off + public static final ThreadLocal indexStrategy = ThreadLocal.withInitial(GeometryNormalizer::createIndexStrategy); + // @formatter:on + + public static final ThreadLocal index = ThreadLocal.withInitial(() -> new CustomNameIndex(indexStrategy.get(), null, "geometryIndex")); + + protected static NumericIndexStrategy createIndexStrategy() { + // @formatter:off + return TieredSFCIndexFactory.createFullIncrementalTieredStrategy( + new NumericDimensionDefinition[]{ + new LongitudeDefinition(), + new LatitudeDefinition( + true) + // just use the same range for latitude to make square sfc values in + // decimal degrees (EPSG:4326) + }, + new int[]{ + LONGITUDE_BITS, + LATITUDE_BITS + }, + SFCFactory.SFCType.HILBERT); + // @formatter:on + } + + public NumericIndexStrategy getIndexStrategy() { + // NOTE: If we change the index strategy, then we will need to update the validHash method appropriately. + return GeometryNormalizer.indexStrategy.get(); + } + + public static NumericIndexStrategy getGeometryIndexStrategy() { + return GeometryNormalizer.indexStrategy.get(); + } + + public Index getIndex() { + return index.get(); + } + + public static Index getGeometryIndex() { + return index.get(); + } + + @Override + public List normalizeToMany(String geoString) throws IllegalArgumentException { + if (validHash(geoString)) { + return Lists.newArrayList(geoString); + } + return normalizeDelegateTypeToMany(createDatawaveGeometry(parseGeometry(geoString))); + } + + @Override + public List normalizeDelegateTypeToMany(Geometry geometry) { + List list = Lists.newArrayList(); + for (byte[] one : getIndicesFromGeometry(geometry)) { + list.add(getEncodedStringFromIndexBytes(one)); + } + return list; + } + + protected datawave.data.type.util.Geometry createDatawaveGeometry(org.locationtech.jts.geom.Geometry geometry) { + return new datawave.data.type.util.Geometry(geometry); + } + +} diff --git 
a/core/utils/type-utils/src/main/java/datawave/data/normalizer/HexStringNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/HexStringNormalizer.java new file mode 100644 index 00000000000..762f09a5549 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/HexStringNormalizer.java @@ -0,0 +1,74 @@ +package datawave.data.normalizer; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class HexStringNormalizer extends AbstractNormalizer { + + private static final long serialVersionUID = -2056362158103923525L; + private static final Logger log = LoggerFactory.getLogger(HexStringNormalizer.class); + private final Pattern pattern; + + public HexStringNormalizer() { + this("(0x)?([0-9a-fA-F]+)"); + } + + protected HexStringNormalizer(String regex) { + pattern = Pattern.compile(regex); + } + + protected String getNormalizedHex(String hex) { + if (hex.length() % 2 == 0) { + return LC_NO_DIACRITICS_NORMALIZER.normalize(hex); + } + + StringBuilder buf = new StringBuilder(hex.length() + 1); + return LC_NO_DIACRITICS_NORMALIZER.normalize(buf.append("0").append(hex).toString()); + } + + protected Matcher validate(String fieldValue) { + if (StringUtils.isEmpty(fieldValue)) { + logAndThrow("Field may not be null or empty."); + } + + Matcher matcher = pattern.matcher(fieldValue); + if (!matcher.matches()) { + logAndThrow(String.format("Failed to normalize hex value : %s.", fieldValue)); + } + + return matcher; + } + + @Override + public String normalize(String fieldValue) { + Matcher matcher = validate(fieldValue); + + return getNormalizedHex(matcher.group(2)); + } + + private void logAndThrow(String msg) { + if (log.isDebugEnabled()) { + log.debug(msg); + } + throw new IllegalArgumentException(msg); + } + + @Override + public String normalizeRegex(String fieldRegex) { + return 
normalize(fieldRegex); + } + + @Override + public String normalizeDelegateType(String delegateIn) { + return normalize(delegateIn); + } + + @Override + public String denormalize(String in) { + return in; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/IpAddressNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/IpAddressNormalizer.java new file mode 100644 index 00000000000..885632dfbe0 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/IpAddressNormalizer.java @@ -0,0 +1,56 @@ +package datawave.data.normalizer; + +import org.apache.commons.net.util.SubnetUtils; + +import datawave.data.type.util.IpAddress; +import datawave.query.parser.JavaRegexAnalyzer; +import datawave.query.parser.JavaRegexAnalyzer.JavaRegexParseException; + +public class IpAddressNormalizer extends AbstractNormalizer { + + private static final long serialVersionUID = 8604032745289485764L; + + public String normalize(String fieldValue) { + try { + fieldValue = fieldValue.replaceAll(" ", ""); + return IpAddress.parse(fieldValue).toZeroPaddedString(); + } catch (IllegalArgumentException iae) { + throw new IpAddressNormalizer.Exception("Failed to normalize " + fieldValue + " as an IP"); + } + } + + /** + * Note that we really cannot normalize the regex here, so the regex must work against the normalized and unnormalized forms. 
+ */ + public String normalizeRegex(String fieldRegex) { + try { + return new JavaRegexAnalyzer(fieldRegex).getZeroPadIpRegex(); + } catch (JavaRegexParseException jrpe) { + throw new IllegalArgumentException("Failed to parse ip regex " + fieldRegex, jrpe); + } + } + + public String[] normalizeCidrToRange(String cidr) { + SubnetUtils subnetUtils = new SubnetUtils(cidr); + subnetUtils.setInclusiveHostCount(true); + SubnetUtils.SubnetInfo info = subnetUtils.getInfo(); + return new String[] {normalize(info.getLowAddress()), normalize(info.getHighAddress())}; + } + + @Override + public String normalizeDelegateType(IpAddress delegateIn) { + return delegateIn.toZeroPaddedString(); + } + + @Override + public IpAddress denormalize(String in) { + return IpAddress.parse(in); + } + + public static class Exception extends IllegalArgumentException { + public Exception(String message) { + super(message); + } + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/LcNoDiacriticsNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/LcNoDiacriticsNormalizer.java new file mode 100644 index 00000000000..2dde04c4b98 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/LcNoDiacriticsNormalizer.java @@ -0,0 +1,74 @@ +package datawave.data.normalizer; + +import java.text.Normalizer; +import java.text.Normalizer.Form; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import datawave.query.parser.JavaRegexAnalyzer; +import datawave.query.parser.JavaRegexAnalyzer.JavaRegexParseException; + +/** + * A Normalizer which performs the following steps: + *

<ol>
+ * <li>Unicode canonical decomposition ({@link Form#NFD})</li>
+ * <li>Removal of diacritical marks</li>
+ * <li>Unicode canonical composition ({@link Form#NFC})</li>
+ * <li>lower casing in the {@link Locale#ENGLISH English locale}</li>
+ * </ol>
+ */ +public class LcNoDiacriticsNormalizer extends AbstractNormalizer { + private static final long serialVersionUID = -7922074256473963293L; + private static final Pattern diacriticals = Pattern.compile("\\p{InCombiningDiacriticalMarks}"); + + public String normalize(String fieldValue) { + if (null == fieldValue) { + return null; + } + String decomposed = Normalizer.normalize(fieldValue, Form.NFD); + String noDiacriticals = removeDiacriticalMarks(decomposed); + String recomposed = Normalizer.normalize(noDiacriticals, Form.NFC); + return recomposed.toLowerCase(Locale.ENGLISH); + } + + private String removeDiacriticalMarks(String str) { + Matcher matcher = diacriticals.matcher(str); + return matcher.replaceAll(""); + } + + public String normalizeRegex(String fieldRegex) { + if (null == fieldRegex) { + return null; + } + String decomposed = Normalizer.normalize(fieldRegex, Form.NFD); + String noDiacriticals = removeDiacriticalMarks(decomposed); + String recomposed = Normalizer.normalize(noDiacriticals, Form.NFC); + try { + JavaRegexAnalyzer regex = new JavaRegexAnalyzer(recomposed); + regex.applyRegexCaseSensitivity(false); + return regex.getRegex(); + } catch (JavaRegexParseException e) { + throw new IllegalArgumentException("Unable to parse regex " + fieldRegex, e); + } + } + + @Override + public boolean normalizedRegexIsLossy(String regex) { + // Despite this normalizer actually being lossy, we are still + // returning false as users are used to overmatching when including + // diacritics or upper case letter. We may consider changing this + // down the road, but for now returning false. 
+ return false; + } + + @Override + public String normalizeDelegateType(String delegateIn) { + return normalize(delegateIn); + } + + @Override + public String denormalize(String in) { + return in; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/LcNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/LcNormalizer.java new file mode 100644 index 00000000000..ab678ec0aa4 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/LcNormalizer.java @@ -0,0 +1,50 @@ +package datawave.data.normalizer; + +import java.util.Locale; + +import datawave.query.parser.JavaRegexAnalyzer; +import datawave.query.parser.JavaRegexAnalyzer.JavaRegexParseException; + +/** + * Normalizes values by lower casing them in the {@link Locale#ENGLISH English locale}; regexes have case sensitivity turned off through {@link JavaRegexAnalyzer}. + */ +public class LcNormalizer extends AbstractNormalizer<String> { + + private static final long serialVersionUID = 8311875506912885780L; + + public String normalize(String fieldValue) { + return (null == fieldValue) ? null : fieldValue.toLowerCase(Locale.ENGLISH); // null-safe, matching normalizeRegex below + } + + public String normalizeRegex(String fieldRegex) { + if (null == fieldRegex) { + return null; + } + try { + JavaRegexAnalyzer regex = new JavaRegexAnalyzer(fieldRegex); + regex.applyRegexCaseSensitivity(false); + return regex.getRegex(); + } catch (JavaRegexParseException e) { + throw new IllegalArgumentException("Unable to parse regex " + fieldRegex, e); + } + } + + @Override + public boolean normalizedRegexIsLossy(String regex) { + // Despite this normalizer actually being lossy, we are still + // returning false as users are used to overmatching when including + // diacritics or upper case letter. We may consider changing this + // down the road, but for now returning false.
+ return false; + } + + @Override + public String normalizeDelegateType(String delegateIn) { + return normalize(delegateIn); + } + + @Override + public String denormalize(String in) { + return in; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/MacAddressNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/MacAddressNormalizer.java new file mode 100644 index 00000000000..70631098f0e --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/MacAddressNormalizer.java @@ -0,0 +1,116 @@ +package datawave.data.normalizer; + +import org.apache.commons.lang3.StringUtils; + +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; + +public class MacAddressNormalizer extends AbstractNormalizer { + + private static final long serialVersionUID = -2606365671421121859L; + + public String normalize(String fieldValue) { + + String mac = ""; + + String parts[] = Iterables.toArray(Splitter.on(':').split(fieldValue), String.class); + if (parts.length == 6) { + // Verify it is padded ie.e 11:01:00:11:11:11 + // Return 11-01-00-11-11-11 + return StringUtils.join(padWithZeros(parts), "-"); + } + + parts = Iterables.toArray(Splitter.on('-').split(fieldValue), String.class); + if (parts.length == 6) { + + // Verify it is padded ie.e 11-01-00-11-11-11 + // Return 11-01-00-11-11-11 + return StringUtils.join(padWithZeros(parts), "-"); + } + + // 6 bytes for a macaddr + + try { + long lData = Long.parseLong(fieldValue, 16); + + if (!isMac(lData)) { + throw new IllegalArgumentException("Failed to normalize " + fieldValue + " as a MAC"); + } + + for (int i = 0; i < 6; i++) { + final String twoChars = Long.toHexString(lData & 0x00000000000000FFl); + lData = lData >> 8; + if (twoChars.length() == 1) { + mac = "0" + twoChars + mac; + + } else { + mac = twoChars + mac; + + } + mac = "-" + mac; + } + return (mac.substring(1)); + } catch (NumberFormatException e) { + throw new 
IllegalArgumentException("Failed to normalize " + fieldValue + " as a MAC"); + } + } + + /** + * Note that we really cannot normalize the regex here, so the regex must work against the normalized and unnormalized forms. + */ + public String normalizeRegex(String fieldRegex) { + return fieldRegex; + } + + public static boolean isMac(Long lData) { + + long mask = 0xFFFF000000000000l; + + if ((lData & mask) != 0) + return false; + + return true; + + } + + public static boolean isMac(String value) { + + long lData; + + try { + lData = Long.parseLong(value, 16); + + } catch (Exception e) { + return false; + } + + return isMac(lData); + + } + + private static String[] padWithZeros(String mac[]) { + String padded[] = new String[mac.length]; + + for (int i = 0; i < mac.length; i++) { + if (mac[i].length() == 1) { + padded[i] = "0" + mac[i]; + } else { + padded[i] = new String(mac[i]); + + } + } + + return padded; + + } + + @Override + public String normalizeDelegateType(String delegateIn) { + return normalize(delegateIn); + } + + @Override + public String denormalize(String in) { + return in; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/NetworkNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NetworkNormalizer.java new file mode 100644 index 00000000000..444f9015d66 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NetworkNormalizer.java @@ -0,0 +1,46 @@ +package datawave.data.normalizer; + +public class NetworkNormalizer extends AbstractNormalizer { + + private static final long serialVersionUID = 8279399353763569005L; + + public String normalize(String fieldValue) { + String normed = fieldValue; + + try { + normed = IP_ADDRESS_NORMALIZER.normalize(fieldValue); + + } catch (Exception iae) { + /** + * try as a mac address + */ + try { + normed = MAC_ADDRESS_NORMALIZER.normalize(fieldValue); + } catch (Exception e) { + /** + * ok, default to string normalization + */ + normed = 
LC_NO_DIACRITICS_NORMALIZER.normalize(fieldValue); + } + } + return normed; + } + + /** + * Note that we really cannot normalize the regex here, so the regex must work against the normalized and unnormalized forms. + */ + public String normalizeRegex(String fieldRegex) { + return fieldRegex; + } + + @Override + public String normalizeDelegateType(String delegateIn) { + return normalize(delegateIn); + } + + @Override + public String denormalize(String in) { + return in; + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/NoOpNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NoOpNormalizer.java new file mode 100644 index 00000000000..5b8277cb3b6 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NoOpNormalizer.java @@ -0,0 +1,27 @@ +package datawave.data.normalizer; + +/** + * A pass-through normalizer: values, regexes and delegate values are returned exactly as given. + */ +public class NoOpNormalizer extends AbstractNormalizer { + + private static final long serialVersionUID = -2599171413081079348L; + + public String normalize(String fieldValue) { + return fieldValue; + } + + public String normalizeRegex(String fieldRegex) { + return fieldRegex; + } + + @Override + public String normalizeDelegateType(String delegateIn) { + return normalize(delegateIn); + } + + @Override + public String denormalize(String in) { + return in; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/NormalizationException.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NormalizationException.java new file mode 100644 index 00000000000..f4727e15207 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NormalizationException.java @@ -0,0 +1,25 @@ +package datawave.data.normalizer; + +import java.io.Serializable; + +public class NormalizationException extends Exception implements Serializable { + + private static final long serialVersionUID = -2700045630205135530L; + + public NormalizationException() { + super(); + } + +
public NormalizationException(String message, Throwable cause) { + super(message, cause); + } + + public NormalizationException(String message) { + super(message); + } + + public NormalizationException(Throwable cause) { + super(cause); + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/Normalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/Normalizer.java new file mode 100644 index 00000000000..6ef0aaf407a --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/Normalizer.java @@ -0,0 +1,42 @@ +package datawave.data.normalizer; + +import java.io.Serializable; +import java.math.BigDecimal; +import java.util.Collection; +import java.util.Date; + +import datawave.data.type.util.Geometry; +import datawave.data.type.util.IpAddress; +import datawave.data.type.util.Point; + +public interface Normalizer extends Serializable { + + Normalizer IP_ADDRESS_NORMALIZER = new IpAddressNormalizer(); + Normalizer MAC_ADDRESS_NORMALIZER = new MacAddressNormalizer(); + Normalizer LC_NO_DIACRITICS_NORMALIZER = new LcNoDiacriticsNormalizer(); + Normalizer DATE_NORMALIZER = new DateNormalizer(); + Normalizer RAW_DATE_NORMALIZER = new RawDateNormalizer(); + Normalizer GEOMETRY_NORMALIZER = new GeometryNormalizer(); + Normalizer GEO_LAT_NORMALIZER = new GeoLatNormalizer(); + Normalizer GEO_LON_NORMALIZER = new GeoLonNormalizer(); + Normalizer GEO_NORMALIZER = new GeoNormalizer(); + Normalizer HEX_STRING_NORMALIZER = new HexStringNormalizer(); + Normalizer LC_NORMALIZER = new LcNormalizer(); + Normalizer NETWORK_NORMALIZER = new NetworkNormalizer(); + Normalizer NUMBER_NORMALIZER = new NumberNormalizer(); + Normalizer POINT_NORMALIZER = new PointNormalizer(); + Normalizer TRIM_LEADING_ZEROS_NORMALIZER = new TrimLeadingZerosNormalizer(); + Normalizer NOOP_NORMALIZER = new NoOpNormalizer(); + + String normalize(String in); + + String normalizeDelegateType(T delegateIn); + + T denormalize(String in); + + String 
normalizeRegex(String in); + + boolean normalizedRegexIsLossy(String in); + + Collection expand(String in); +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/NumberNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NumberNormalizer.java new file mode 100644 index 00000000000..d1bb4eae6c2 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NumberNormalizer.java @@ -0,0 +1,66 @@ +package datawave.data.normalizer; + +import java.math.BigDecimal; + +import org.apache.log4j.Logger; + +import datawave.data.normalizer.regex.NumericRegexEncoder; +import datawave.data.type.util.NumericalEncoder; + +public class NumberNormalizer extends AbstractNormalizer { + + private static final long serialVersionUID = -2781476072987375820L; + private Logger log = Logger.getLogger(NumberNormalizer.class); + + public String normalize(String fv) { + if (NumericalEncoder.isPossiblyEncoded(fv)) { + try { + NumericalEncoder.decode(fv); + return fv; + } catch (Exception e2) { + // no problem here, we will simply try to encode it below + } + } + try { + return NumericalEncoder.encode(fv); + } catch (Exception e) { + throw new IllegalArgumentException("Failed to normalize value as a number: " + fv); + } + } + + /** + * We can support regex against numbers. 
+ */ + public String normalizeRegex(String fieldRegex) { + try { + return NumericRegexEncoder.encode(fieldRegex); + } catch (IllegalArgumentException e) { + log.debug("Failed to normalize numeric field pattern '" + fieldRegex + "', returning regex as is", e); + return fieldRegex; + } + } + + public boolean normalizedRegexIsLossy(String untrimmedRegex) { + ZeroRegexStatus status = NumericRegexEncoder.getZeroRegexStatus(untrimmedRegex); + + return (status.equals(ZeroRegexStatus.LEADING) || status.equals(ZeroRegexStatus.TRAILING)); + } + + @Override + public String normalizeDelegateType(BigDecimal delegateIn) { + return normalize(delegateIn.toString()); + } + + @Override + public BigDecimal denormalize(String in) { + if (NumericalEncoder.isPossiblyEncoded(in)) { + try { + return NumericalEncoder.decode(in); + } catch (NumberFormatException e) { + // not encoded... + } + } + return new BigDecimal(in); + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/OneToManyNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/OneToManyNormalizer.java new file mode 100644 index 00000000000..1ad641ce852 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/OneToManyNormalizer.java @@ -0,0 +1,10 @@ +package datawave.data.normalizer; + +import java.util.List; + +public interface OneToManyNormalizer extends Normalizer { + + List normalizeToMany(String in); + + List normalizeDelegateTypeToMany(T foo); +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/PointNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/PointNormalizer.java new file mode 100644 index 00000000000..51075af9d70 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/PointNormalizer.java @@ -0,0 +1,72 @@ +package datawave.data.normalizer; + +import org.locationtech.geowave.core.geotime.index.dimension.LatitudeDefinition; +import 
org.locationtech.geowave.core.geotime.index.dimension.LongitudeDefinition; +import org.locationtech.geowave.core.index.NumericIndexStrategy; +import org.locationtech.geowave.core.index.dimension.NumericDimensionDefinition; +import org.locationtech.geowave.core.index.sfc.SFCFactory; +import org.locationtech.geowave.core.index.sfc.tiered.TieredSFCIndexFactory; +import org.locationtech.geowave.core.store.api.Index; +import org.locationtech.geowave.core.store.index.CustomNameIndex; + +import datawave.data.type.util.Point; + +/** + * A normalizer that, given a parseable geometry string representing a point geometry will perform GeoWave indexing with a single-tier spatial geowave index + * configuration + */ +public class PointNormalizer extends AbstractGeometryNormalizer { + private static final long serialVersionUID = 171360806347433135L; + + // NOTE: If we change the index strategy, then we will need to update the validHash method appropriately. + // @formatter:off + public static final ThreadLocal indexStrategy = ThreadLocal.withInitial(PointNormalizer::createIndexStrategy); + // @formatter:on + + protected static NumericIndexStrategy createIndexStrategy() { + // @formatter:off + return TieredSFCIndexFactory.createSingleTierStrategy( + new NumericDimensionDefinition[]{ + new LongitudeDefinition(), + new LatitudeDefinition( + true) + // just use the same range for latitude to make square sfc values in + // decimal degrees (EPSG:4326) + }, + new int[]{ + LONGITUDE_BITS, + LATITUDE_BITS + }, + SFCFactory.SFCType.HILBERT); + // @formatter:on + } + + public static final ThreadLocal index = ThreadLocal.withInitial(() -> new CustomNameIndex(indexStrategy.get(), null, "pointIndex")); + + public NumericIndexStrategy getIndexStrategy() { + // NOTE: If we change the index strategy, then we will need to update the validHash method appropriately. 
+ return PointNormalizer.indexStrategy.get(); + } + + public static NumericIndexStrategy getPointIndexStrategy() { + return PointNormalizer.indexStrategy.get(); + } + + public Index getIndex() { + return index.get(); + } + + public static Index getPointIndex() { + return index.get(); + } + + protected Point createDatawaveGeometry(org.locationtech.jts.geom.Point geometry) { + return new Point(geometry); + } + + @Override + public boolean validTier(short tier) { + return tier == 0x1f; + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/RawDateNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/RawDateNormalizer.java new file mode 100644 index 00000000000..d318ec8f4be --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/RawDateNormalizer.java @@ -0,0 +1,34 @@ +package datawave.data.normalizer; + +import java.util.Collection; + +public class RawDateNormalizer extends AbstractNormalizer { + + private static final long serialVersionUID = -3268331784114135470L; + private DateNormalizer delegate = new DateNormalizer(); + + @Override + public String normalize(String fieldValue) { + return delegate.normalize(fieldValue); + } + + public String normalizeRegex(String fieldRegex) { + return delegate.normalizeRegex(fieldRegex); + } + + @Override + public String normalizeDelegateType(String delegateIn) { + return delegate.normalize(delegateIn); + } + + @Override + public String denormalize(String in) { + return in; + } + + @Override + public Collection expand(String dateString) { + return delegate.expand(dateString); + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/TrimLeadingZerosNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/TrimLeadingZerosNormalizer.java new file mode 100644 index 00000000000..2adb0ff6002 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/TrimLeadingZerosNormalizer.java @@ -0,0 
+1,34 @@ +package datawave.data.normalizer; + +public class TrimLeadingZerosNormalizer extends AbstractNormalizer { + + private static final long serialVersionUID = -5681890794025882300L; + + public String normalize(String fv) { + int len = fv.length(); + int index; + for (index = 0; (index < len) && (fv.charAt(index) == '0'); index++) + ; + if (index > 0) { + fv = fv.substring(index); + } + return fv; + } + + /** + * Note that we really cannot normalize the regex here, so the regex must work against the normalized and unnormalized forms. + */ + public String normalizeRegex(String fieldRegex) { + return fieldRegex; + } + + @Override + public String normalizeDelegateType(String delegateIn) { + return normalize(delegateIn); + } + + @Override + public String denormalize(String in) { + return in; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/ZeroRegexStatus.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/ZeroRegexStatus.java new file mode 100644 index 00000000000..2e6f43a155d --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/ZeroRegexStatus.java @@ -0,0 +1,5 @@ +package datawave.data.normalizer; + +public enum ZeroRegexStatus { + LEADING, TRAILING, NONE +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/AlternationNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/AlternationNode.java new file mode 100644 index 00000000000..8be4b30fa39 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/AlternationNode.java @@ -0,0 +1,38 @@ +package datawave.data.normalizer.regex; + +import java.util.Collection; +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents a regex alternation, i.e. {@code |}. 
+ */ +public class AlternationNode extends Node { + + public AlternationNode() {} + + public AlternationNode(Collection children) { + super(children); + } + + public AlternationNode(Map properties) { + super(properties); + } + + @Override + public NodeType getType() { + return NodeType.ALTERNATION; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitAlternation(this, data); + } + + @Override + public AlternationNode shallowCopy() { + return new AlternationNode(this.properties); + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/AnyCharNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/AnyCharNode.java new file mode 100644 index 00000000000..5c30c3f90e0 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/AnyCharNode.java @@ -0,0 +1,32 @@ +package datawave.data.normalizer.regex; + +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents a dot in a regex pattern. 
+ */ +public class AnyCharNode extends Node { + + public AnyCharNode() {} + + public AnyCharNode(Map properties) { + super(properties); + } + + @Override + public NodeType getType() { + return NodeType.ANY_CHAR; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitAnyChar(this, data); + } + + @Override + public AnyCharNode shallowCopy() { + return new AnyCharNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/CharClassNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/CharClassNode.java new file mode 100644 index 00000000000..0c6350c4b22 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/CharClassNode.java @@ -0,0 +1,48 @@ +package datawave.data.normalizer.regex; + +import java.util.Map; +import java.util.Objects; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents a character class in a regex pattern encapsulated by {@code [...]}. 
+ */ +public class CharClassNode extends Node { + + public static final String PROPERTY_NEGATED = "negated"; + private static final String TRUE = String.valueOf(true); + + public CharClassNode() {} + + public CharClassNode(boolean negated) { + setProperty(PROPERTY_NEGATED, String.valueOf(negated)); + } + + public CharClassNode(Map properties) { + super(properties); + } + + public boolean isNegated() { + return hasProperty(PROPERTY_NEGATED) && getProperty(PROPERTY_NEGATED).equals(TRUE); + } + + public void negate() { + setProperty(PROPERTY_NEGATED, TRUE); + } + + @Override + public NodeType getType() { + return NodeType.CHAR_CLASS; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitCharClass(this, data); + } + + @Override + public CharClassNode shallowCopy() { + return new CharClassNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/CharRangeNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/CharRangeNode.java new file mode 100644 index 00000000000..4538c3d7794 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/CharRangeNode.java @@ -0,0 +1,59 @@ +package datawave.data.normalizer.regex; + +import java.util.Map; +import java.util.Objects; +import java.util.StringJoiner; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents a character range defined in a character class in a regex pattern. 
+ */ +public class CharRangeNode extends Node { + + public static final String PROPERTY_START = "start"; + public static final String PROPERTY_END = "end"; + + public CharRangeNode() {} + + public CharRangeNode(Map properties) { + super(properties); + } + + public CharRangeNode(char start, char end) { + setStart(start); + setEnd(end); + } + + public char getStart() { + return getProperty(PROPERTY_START).charAt(0); + } + + public void setStart(char start) { + setProperty(PROPERTY_START, String.valueOf(start)); + } + + public char getEnd() { + return getProperty(PROPERTY_END).charAt(0); + } + + public void setEnd(char end) { + setProperty(PROPERTY_END, String.valueOf(end)); + } + + @Override + public NodeType getType() { + return NodeType.CHAR_RANGE; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitCharRange(this, data); + } + + @Override + public CharRangeNode shallowCopy() { + return new CharRangeNode(this.properties); + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/DigitCharClassNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/DigitCharClassNode.java new file mode 100644 index 00000000000..d77e2ccab97 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/DigitCharClassNode.java @@ -0,0 +1,34 @@ +package datawave.data.normalizer.regex; + +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents the digit character class {@code \d} in a regex pattern. 
+ */ +public class DigitCharClassNode extends Node { + + protected DigitCharClassNode() { + super(); + } + + public DigitCharClassNode(Map properties) { + super(properties); + } + + @Override + public NodeType getType() { + return NodeType.DIGIT_CHAR_CLASS; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitDigitChar(this, data); + } + + @Override + public DigitCharClassNode shallowCopy() { + return new DigitCharClassNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EmptyNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EmptyNode.java new file mode 100644 index 00000000000..f0aa0f983aa --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EmptyNode.java @@ -0,0 +1,32 @@ +package datawave.data.normalizer.regex; + +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Placeholder empty node for empty groups or empty alternation branches. 
+ */ +public class EmptyNode extends Node { + + public EmptyNode() {} + + public EmptyNode(Map properties) { + super(properties); + } + + @Override + public NodeType getType() { + return NodeType.EMPTY; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitEmpty(this, data); + } + + @Override + public Node shallowCopy() { + return new EmptyNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EncodedNumberNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EncodedNumberNode.java new file mode 100644 index 00000000000..7fdfa375996 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EncodedNumberNode.java @@ -0,0 +1,37 @@ +package datawave.data.normalizer.regex; + +import java.util.Collection; +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents an encoded simple number in a regex tree. 
+ */ +public class EncodedNumberNode extends Node { + + public EncodedNumberNode() {} + + public EncodedNumberNode(Collection children) { + addChildren(children); + } + + public EncodedNumberNode(Map properties) { + super(properties); + } + + @Override + public NodeType getType() { + return NodeType.ENCODED_NUMBER; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitEncodedNumber(this, data); + } + + @Override + public Node shallowCopy() { + return new EncodedNumberNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EncodedPatternNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EncodedPatternNode.java new file mode 100644 index 00000000000..ce40d4ec7d8 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EncodedPatternNode.java @@ -0,0 +1,41 @@ +package datawave.data.normalizer.regex; + +import java.util.Collection; +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents an encoded pattern in a regex tree. 
+ */ +public class EncodedPatternNode extends Node { + + public EncodedPatternNode() {} + + public EncodedPatternNode(Node child) { + super(child); + } + + public EncodedPatternNode(Collection children) { + super(children); + } + + public EncodedPatternNode(Map properties) { + super(properties); + } + + @Override + public NodeType getType() { + return NodeType.ENCODED_PATTERN; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitEncodedPattern(this, data); + } + + @Override + public Node shallowCopy() { + return new EncodedPatternNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EndAnchorNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EndAnchorNode.java new file mode 100644 index 00000000000..e645d597ece --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EndAnchorNode.java @@ -0,0 +1,32 @@ +package datawave.data.normalizer.regex; + +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents a regex end anchor, i.e. {@code $}. 
+ */ +public class EndAnchorNode extends Node { + + public EndAnchorNode() {} + + public EndAnchorNode(Map properties) { + super(properties); + } + + @Override + public NodeType getType() { + return NodeType.END_ANCHOR; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitEndAnchor(this, data); + } + + @Override + public Node shallowCopy() { + return new EndAnchorNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EscapedSingleCharNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EscapedSingleCharNode.java new file mode 100644 index 00000000000..1f4ae4d9654 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EscapedSingleCharNode.java @@ -0,0 +1,46 @@ +package datawave.data.normalizer.regex; + +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents an escaped character in a regex pattern, e.g. {@code \-}. 
+ */ +public class EscapedSingleCharNode extends Node { + + public static final String PROPERTY_CHAR = "char"; + + public EscapedSingleCharNode() {} + + public EscapedSingleCharNode(char character) { + setCharacter(character); + } + + public EscapedSingleCharNode(Map properties) { + super(properties); + } + + public char getCharacter() { + return getProperty(PROPERTY_CHAR).charAt(0); + } + + public void setCharacter(char character) { + setProperty(PROPERTY_CHAR, String.valueOf(character)); + } + + @Override + public NodeType getType() { + return NodeType.ESCAPED_SINGLE_CHAR; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitEscapedSingleChar(this, data); + } + + @Override + public EscapedSingleCharNode shallowCopy() { + return new EscapedSingleCharNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/ExpressionNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/ExpressionNode.java new file mode 100644 index 00000000000..ec00add8f7b --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/ExpressionNode.java @@ -0,0 +1,41 @@ +package datawave.data.normalizer.regex; + +import java.util.List; +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents some subset or the full part of a regex pattern. 
+ */ +public class ExpressionNode extends Node { + + public ExpressionNode() {} + + public ExpressionNode(Node child) { + super(child); + } + + public ExpressionNode(List children) { + super(children); + } + + public ExpressionNode(Map properties) { + super(properties); + } + + @Override + public NodeType getType() { + return NodeType.EXPRESSION; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitExpression(this, data); + } + + @Override + public ExpressionNode shallowCopy() { + return new ExpressionNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/GroupNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/GroupNode.java new file mode 100644 index 00000000000..52d32c9bb02 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/GroupNode.java @@ -0,0 +1,41 @@ +package datawave.data.normalizer.regex; + +import java.util.List; +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents a regex group in a regex pattern encapsulated by {@code (...)}. 
+ */ +public class GroupNode extends Node { + + public GroupNode() {} + + public GroupNode(Node child) { + super(child); + } + + public GroupNode(List children) { + super(children); + } + + public GroupNode(Map properties) { + super(properties); + } + + @Override + public NodeType getType() { + return NodeType.GROUP; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitGroup(this, data); + } + + @Override + public GroupNode shallowCopy() { + return new GroupNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/IntegerNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/IntegerNode.java new file mode 100644 index 00000000000..2dbee0a54c6 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/IntegerNode.java @@ -0,0 +1,48 @@ +package datawave.data.normalizer.regex; + +import java.util.Map; +import java.util.Objects; +import java.util.StringJoiner; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents an integer parsed in a regex repetition that did not contain a range, e.g. {@code {3}}. 
+ */ +public class IntegerNode extends Node { + + public static final String PROPERTY_VALUE = "value"; + + public IntegerNode() {} + + public IntegerNode(int value) { + setValue(value); + } + + public IntegerNode(Map properties) { + super(properties); + } + + public int getValue() { + return Integer.parseInt(getProperty(PROPERTY_VALUE)); + } + + public void setValue(int value) { + setProperty(PROPERTY_VALUE, String.valueOf(value)); + } + + @Override + public NodeType getType() { + return NodeType.INTEGER; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitInteger(this, data); + } + + @Override + public IntegerNode shallowCopy() { + return new IntegerNode(properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/IntegerRangeNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/IntegerRangeNode.java new file mode 100644 index 00000000000..fa264b0ab46 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/IntegerRangeNode.java @@ -0,0 +1,67 @@ +package datawave.data.normalizer.regex; + +import java.util.Map; +import java.util.Objects; +import java.util.StringJoiner; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents an integer range parsed from a regex repetition that specified a range, e.g. {@code {3,}} or {@code {3,10}}. 
+ */ +public class IntegerRangeNode extends Node { + + public static final String PROPERTY_START = "start"; + public static final String PROPERTY_END = "end"; + + public IntegerRangeNode() {} + + public IntegerRangeNode(int start, Integer end) { + setStart(start); + setEnd(end); + } + + public IntegerRangeNode(Map properties) { + super(properties); + } + + public int getStart() { + return Integer.parseInt(getProperty(PROPERTY_START)); + } + + public void setStart(int start) { + setProperty(PROPERTY_START, String.valueOf(start)); + } + + public Integer getEnd() { + if (hasProperty(PROPERTY_END)) { + return Integer.valueOf(getProperty(PROPERTY_END)); + } + return null; + } + + public void setEnd(Integer end) { + if (end != null) { + setProperty(PROPERTY_END, String.valueOf(end)); + } + } + + public boolean isEndBounded() { + return hasProperty(PROPERTY_END); + } + + @Override + public NodeType getType() { + return NodeType.INTEGER_RANGE; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitIntegerRange(this, data); + } + + @Override + public IntegerRangeNode shallowCopy() { + return new IntegerRangeNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/Node.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/Node.java new file mode 100644 index 00000000000..30d50417981 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/Node.java @@ -0,0 +1,333 @@ +package datawave.data.normalizer.regex; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; + +import datawave.data.normalizer.regex.visitor.Visitor; + +public abstract class Node { + + protected Node parent; + protected Map properties; + protected ArrayList children = new ArrayList<>(); + + protected Node() {} + + 
@SuppressWarnings("CopyConstructorMissesField") + protected Node(Node child) { + addChild(child); + } + + protected Node(Map properties) { + if (properties != null) { + this.properties = new HashMap<>(); + this.properties.putAll(properties); + } + } + + protected Node(Collection children) { + addChildren(children); + } + + /** + * Return the node type. + * + * @return the type + */ + public abstract NodeType getType(); + + /** + * Return the parent of this {@link Node}. Possibly null if a parent was never set. + * + * @return the parent + */ + public Node getParent() { + return parent; + } + + /** + * Set the parent for this node. + * + * @param parent + * the parent + */ + public void setParent(Node parent) { + this.parent = parent; + } + + public boolean hasProperties() { + return properties != null; + } + + public boolean hasProperty(String key) { + return hasProperties() && properties.containsKey(key); + } + + public String getProperty(String key) { + return properties == null ? null : properties.get(key); // the map is created lazily, so it may still be null + } + + public void setProperty(String key, String value) { + if (properties == null) { + properties = new HashMap<>(); + } + properties.put(key, value); + } + + public void setProperties(Map properties) { + if (properties != null) { + if (this.properties == null) { + this.properties = new HashMap<>(); + } + this.properties.putAll(properties); + } + } + + /** + * Return the children of this {@link Node}. Possibly empty, but never null. + * + * @return the children + */ + public List getChildren() { + return children; + } + + /** + * Set the children for this {@link Node}. If the given list is null, the list of children for this node will be cleared. + * + * @param children + * the children + */ + public void setChildren(Collection children) { + this.children.clear(); + if (children != null) { + children.forEach(this::addChild); + } + } + + /** + * Add a child to the end of the list of children for this node. 
+ * + * @param child + * the child to add + */ + public void addChild(Node child) { + this.children.add(child); + child.parent = this; + } + + /** + * Add a child to this node at the specified index. Shifts the child at the specified index and any subsequent children to the right by one index. + * + * @param child + * the child to insert + * @param index + * the index at which the child is to be inserted + */ + public void addChild(Node child, int index) { + this.children.add(index, child); + child.parent = this; + } + + /** + * Add each node in the given list to the end of the list of children for this node. + * + * @param children + * the children to add + */ + public void addChildren(Collection children) { + children.forEach(this::addChild); + } + + /** + * Return the child at the specified index in this node's list of children. + * + * @param index + * the index + * @return the child + */ + public Node getChildAt(int index) { + return children.get(index); + } + + /** + * Return the number of children this node has. + * + * @return the total number of children + */ + public int getChildCount() { + return children.size(); + } + + /** + * Return whether this node has any children. + * + * @return true if this node has at least one child, or false otherwise + */ + public boolean hasChildren() { + return !children.isEmpty(); + } + + /** + * Returns whether this node is a leaf, that is, whether it has no children. + * + * @return true if this node has no children, or false otherwise + */ + public boolean isLeaf() { + return children.size() == 0; + } + + /** + * Accepts the given visitor and passes itself to the appropriate method in the {@link Visitor} with the given data. + * + * @param visitor + * the visitor + * @param data + * the data + * @return the result from the visitor + */ + public abstract Object accept(Visitor visitor, Object data); + + /** + * Passes the visitor to each child in this node for the child to accept. 
+ * + * @param visitor + * the visitor + * @param data + * the data + * @return the data + */ + public Object childrenAccept(Visitor visitor, Object data) { + children.forEach((child) -> child.accept(visitor, data)); + return data; + } + + /** + * Return a shallow copy of the node of the same type with all relevant attributes except for the parent and children. + * + * @return the shallow copy + */ + public abstract Node shallowCopy(); + + /** + * Return whether any child of this node is an instance of a type not found in the given types. Each child is compared by its exact runtime class. + * + * @param types + * the types + * @return true if any child of this node is a type not found in the given types, or false otherwise + */ + public boolean isAnyChildNotOf(Set> types) { + return children.stream().map(Node::getClass).anyMatch((t) -> !types.contains(t)); + } + + /** + * Return whether any child of this node is an instance of the given type. + * + * @param type + * the type + * @return true if any child of this node is an instance of the given type, or false otherwise + */ + public boolean isAnyChildOf(Class type) { + return children.stream().anyMatch(type::isInstance); + } + + /** + * Returns the index within this node of the first child of the specified type. If no child of the specified type exists in this node, -1 is returned. + * + * @param type + * the type + * @return the index of the first child of the specified type, or -1 if no child of the type is found + */ + public int indexOf(Class type) { + return indexOf(type, 0); + } + + /** + * Returns the index within this node of the first child of the specified type, starting the search at the specified index. If no child of the specified + * type exists at or after position {@code fromIndex}, -1 is returned. 
+ * + * @param type + * the type + * @param fromIndex + * the index to start the search from + * @return the index of the first child of the specified type that is greater than or equal to {@code fromIndex}, or -1 if no child of the type is found + */ + public int indexOf(Class type, int fromIndex) { + for (int i = fromIndex; i < children.size(); i++) { + if (type.isInstance(children.get(i))) { + return i; + } + } + return -1; + } + + /** + * Return the first child of this node, or null if this node has no children. + * + * @return the first node, possibly null + */ + public Node getFirstChild() { + return children.isEmpty() ? null : children.get(0); + } + + /** + * Return the last child of this node, or null if this node has no children. + * + * @return the last node, possibly null + */ + public Node getLastChild() { + return children.isEmpty() ? null : children.get((children.size() - 1)); + } + + /** + * Removes the first child from this node. + * + * @throws IndexOutOfBoundsException + * if there are no children + */ + public void removeFirstChild() { + children.remove(0); + } + + /** + * Return a new {@link NodeListIterator} instance that will traverse over this node's children. 
+ * + * @return a new iterator + */ + public NodeListIterator getChildrenIterator() { + return new NodeListIterator(this.children); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Node node = (Node) o; + return Objects.equals(properties, node.properties); + } + + @Override + public int hashCode() { + return Objects.hash(properties); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(getClass().getSimpleName()); + if (properties != null) { + sb.append("(").append(properties).append(")"); + } + return sb.toString(); + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NodeListIterator.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NodeListIterator.java new file mode 100644 index 00000000000..94980668530 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NodeListIterator.java @@ -0,0 +1,192 @@ +package datawave.data.normalizer.regex; + +import java.util.Collection; +import java.util.List; +import java.util.NoSuchElementException; + +/** + * An iterator for traversing over a list of {@link Node} instances, with functionality for skipping over nodes that meet certain conditions. + */ +public class NodeListIterator { + + /** + * The list. + */ + private final List nodes; + + /** + * The current index. + */ + private int index; + + public NodeListIterator(List nodes) { + this.nodes = nodes; + } + + /** + * Return the current iterator index. + * + * @return the index + */ + public int index() { + return index; + } + + /** + * Set the current index for the iterator. + * + * @param index + * the index + */ + public void setIndex(int index) { + this.index = index; + } + + /** + * Return true if there are more nodes to return from the list. 
+ * + * @return true if there is a next node to return + */ + public boolean hasNext() { + return this.index < this.nodes.size(); + } + + /** + * Return the next node from the list. + * + * @return the next node + * @throws NoSuchElementException + * if there is no next node + */ + public Node next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + return nodes.get(index++); + } + + /** + * Return the next node from the list without modifying the current iterator index. + * + * @return the next node + * @throws NoSuchElementException + * if there is no next node + */ + public Node peekNext() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + return nodes.get((index)); + } + + /** + * Return whether the next node is an instance of the given type. + * + * @param type + * the type + * @return true if the next node is an instance of the type, or false otherwise + * @throws NoSuchElementException + * if there is no next node + */ + public boolean isNextInstanceOf(Class type) { + return type.isInstance(peekNext()); + } + + /** + * Return whether the next node is an instance of one of the given types. + * + * @param types + * the types + * @return true if the next node is an instance of one of the given types, or false otherwise + * @throws NoSuchElementException + * if there is no next node + */ + public boolean isNextInstanceOfAny(Collection> types) { + Node previous = peekNext(); + return types.stream().anyMatch((type) -> type.isInstance(previous)); + } + + /** + * Update the iterator so that the next call to {@link #next()} will return the first node that is not a regex element that can match against the character '0', + * starting from the iterator's current position in the list. If no such node is found, the iterator will be moved to the end of the list, + * {@link #hasNext()} will return false and any call to {@link #next()} will result in a {@link NoSuchElementException}. 
+ */ + public void seekPastZeroMatchingElements() { + while (hasNext()) { + // Peek at the next node. + Node next = peekNext(); + // If the next node can match a zero, skip it. + if (RegexUtils.matchesZero(next)) { + // Explicitly call next so that we increment the iterator index. + next(); + // Seek past any succeeding quantifiers and question marks if present. + seekPastQuantifiers(); + seekPastQuestionMarks(); + } else { + return; + } + } + } + + /** + * Update the iterator so that the next call to {@link #next()} will return the first node that is not a regex element that can match only the character '0', + * starting from the iterator's current position in the list. If no such node is found, the iterator will be moved to the end of the list, + * {@link #hasNext()} will return false and any call to {@link #next()} will result in a {@link NoSuchElementException}. + */ + public void seekPastZeroOnlyElements() { + while (hasNext()) { + // Peek at the next node. + Node next = peekNext(); + // If the next node can match only a zero, skip it. + if (RegexUtils.matchesZeroOnly(next)) { + // Explicitly call next so that we increment the iterator index. + next(); + // Seek past any succeeding quantifiers and question marks if present. + seekPastQuantifiers(); + seekPastQuestionMarks(); + } else { + return; + } + } + } + + /** + * Update the iterator so that the next call to {@link #next()} will return the first node that is not a {@link ZeroOrMoreNode}, {@link OneOrMoreNode}, or + * {@link RepetitionNode}, starting from the iterator's current position in the list. If no such node is found, the iterator will be moved to the end of the + * list, {@link #hasNext()} will return false and any call to {@link #next()} will result in a {@link NoSuchElementException}. 
+ */ + public void seekPastQuantifiers() { + while (isNextQuantifier()) { + next(); + } + } + + /** + * Update the iterator so that the next call to {@link #next()} will return the first node that is not an {@link QuestionMarkNode}, starting from the + * iterator's current position in the list. If no such node is found, the iterator will be moved to the end of the list, {@link #hasNext()} will return + * false and any call to {@link #next()} will result in a {@link NoSuchElementException}. + */ + public void seekPastQuestionMarks() { + while (isNextQuestionMark()) { + next(); + } + } + + /** + * Return whether the next node in the list is a {@link ZeroOrMoreNode}, {@link OneOrMoreNode}, or a {@link RepetitionNode}. + * + * @return true if the next node in the list is a quantifier type, or false otherwise + */ + public boolean isNextQuantifier() { + return hasNext() && isNextInstanceOfAny(RegexConstants.QUANTIFIER_TYPES); + } + + /** + * Return whether the next node in the list is a {@link QuestionMarkNode}. 
+ * + * @return true if the next node in the list is an {@link QuestionMarkNode}, or false otherwise + */ + public boolean isNextQuestionMark() { + return hasNext() && isNextInstanceOf(QuestionMarkNode.class); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NodeType.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NodeType.java new file mode 100644 index 00000000000..f72670789b2 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NodeType.java @@ -0,0 +1,25 @@ +package datawave.data.normalizer.regex; + +public enum NodeType { + + ALTERNATION, + ANY_CHAR, + CHAR_CLASS, + CHAR_RANGE, + DIGIT_CHAR_CLASS, + EMPTY, + END_ANCHOR, + ESCAPED_SINGLE_CHAR, + EXPRESSION, + GROUP, + INTEGER, + INTEGER_RANGE, + ONE_OR_MORE, + OPTIONAL, + REPETITION, + SINGLE_CHAR, + START_ANCHOR, + ZERO_OR_MORE, + ENCODED_NUMBER, + ENCODED_PATTERN +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NumericRegexEncoder.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NumericRegexEncoder.java new file mode 100644 index 00000000000..4e0e885417c --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NumericRegexEncoder.java @@ -0,0 +1,462 @@ +package datawave.data.normalizer.regex; + +import java.util.function.Function; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import org.apache.log4j.Logger; + +import com.google.common.base.CharMatcher; + +import datawave.data.normalizer.ZeroRegexStatus; +import datawave.data.normalizer.regex.visitor.AlternationDeduper; +import datawave.data.normalizer.regex.visitor.AnchorTrimmer; +import datawave.data.normalizer.regex.visitor.DecimalPointPlacer; +import datawave.data.normalizer.regex.visitor.DecimalPointValidator; +import datawave.data.normalizer.regex.visitor.EmptyLeafTrimmer; +import 
datawave.data.normalizer.regex.visitor.ExponentialBinAdder; +import datawave.data.normalizer.regex.visitor.NegativeNumberPatternInverter; +import datawave.data.normalizer.regex.visitor.NegativeVariantExpander; +import datawave.data.normalizer.regex.visitor.NonEncodedNumbersChecker; +import datawave.data.normalizer.regex.visitor.NumericCharClassValidator; +import datawave.data.normalizer.regex.visitor.OptionalVariantExpander; +import datawave.data.normalizer.regex.visitor.PrintVisitor; +import datawave.data.normalizer.regex.visitor.SimpleNumberEncoder; +import datawave.data.normalizer.regex.visitor.StringVisitor; +import datawave.data.normalizer.regex.visitor.ZeroLengthRepetitionTrimmer; +import datawave.data.normalizer.regex.visitor.ZeroTrimmer; +import datawave.data.normalizer.regex.visitor.ZeroValueNormalizer; +import datawave.data.type.util.NumericalEncoder; + +/** + * This class provides functionality for encoding numeric regexes that are meant to match against numbers that were previously encoded via + * {@link NumericalEncoder#encode(String)}. It is expected that incoming regexes are initially written to match against base ten numbers. Due to the complex + * nature of how numbers are encoded and trimmed, accuracy is NOT guaranteed when using this class to encode numeric regexes. + *

+ *

+ * Requirements + *

+ * The following requirements apply to all incoming regexes: + *

    + *
  • Patterns may not be blank.
  • + *
  • Patterns may not contain whitespace.
  • + *
  • Patterns must be compilable.
  • + *
  • Patterns may not contain any letters other than {@code "\d"}.
  • + *
  • Patterns may not contain any escaped characters other than {@code "\."}, {@code "\-"}, or {@code "\d"}.
  • + *
  • Patterns may not contain any groups, e.g. {@code "(45.*)"}.
  • + *
  • Patterns may not contain any decimal points that are followed by {@code ?} {@code *} {@code +} or a repetition quantifier such as {@code {3}}.
  • + *
+ *

+ *

+ * Supported Regex Features + *

+ * The following regex features are supported, with any noted caveats. + *

    + *
  • Wildcards {@code "."}.
  • + *
  • Digit character class {@code "\d"}.
  • + *
  • Character class lists {@code "[]"}. CAVEAT: Digit characters only. Ranges are supported.
  • + *
  • Zero or more quantifier {@code "*"}.
  • + *
  • One or more quantifier {@code "+"}.
  • + *
  • Repetition quantifier {@code "{x}"}, {@code "{x,}"}, and {@code "{x,y}"}.
  • + *
  • Anchors {@code "^"} and {@code "$"}. CAVEAT: Technically not truly supported as they are ultimately removed during the pre-optimization process. However, + * using them will not result in an error.
  • + *
  • Alternations {@code "|"}.
  • + *
+ * Additionally, in order to mark a regex pattern as intended to match negative numbers only, a minus sign should be placed at the beginning of the regex + * pattern, e.g. {@code "-34.*"}, or at the beginning of each desired alternated pattern. + *

+ *

+ * Optimizations + *

+ * Before encoding the incoming regex, it will undergo the following modifications to optimize the ease of encoding: + *

    + *
  1. Any empty alternations will be removed.
  2. + *
  3. Any occurrences of the anchors {@code ^} or {@code $} will be removed. These will need to be added back into the returned encoded regex pattern + * afterwards if desired.
  4. + *
  5. Optional variants (characters followed by {@code ?}) will be expanded into additional alternations. This will not apply to any {@code ?} + instances that directly follow a {@code *}, {@code +}, or {@code {x}}, as the {@code ?} in this case modifies the greediness of the matching rather than + whether or not a character can be present.
  6. + *
  7. Any characters immediately followed by the repetition quantifier {@code "{0}"} or {@code "{0,0}"} will be removed as they are expected to occur zero + * times. This does not apply to characters with the repetition quantifier {@code "{0,}"} or a variation of {@code "{0,x}"}.
  8. + *
  9. Any patterns starting with {@code ".*"} or {@code ".+"} will result in the addition of an alternation of the same pattern with a minus sign in front of + * it to ensure a variant for matching negative numbers is added. This does not apply to any regex patterns already starting with {@code "-.*"} or + * {@code "-.+"}.
  10. + *
  11. In some cases a pattern may match both exactly zero and another number greater than one, e.g. the pattern "[0-9].*". In this case, an alternation for the + * character {@code "0"} will be added (i.e. {@code "[0-9].*|0"}) to ensure that the ability to match zero is not lost when enriching the pattern with the + * required exponential bins to target the appropriate encoded numbers.
  12. + *
  13. Pattern alternations will be de-duped.
  14. + *
+ *

+ *

+ * A strong effort has been made to make resulting encoded patterns as accurate as possible, but there is always a chance of at least some inaccuracy, given the + * nature of how numbers are encoded, particularly when it comes to numbers that are very similar other than the location of a decimal point, if present, in + * them. If you find that the resulting encoded regex is not matching the desired encoding numbers, try to simplify it into a higher number of alternations with + * simpler regexes if possible. + * + * @see NumericalEncoder + */ +public class NumericRegexEncoder { + + private static final Logger log = Logger.getLogger(NumericRegexEncoder.class); + + /** + * Matches against any unescaped d characters, and any other letters. If \d is present, that indicates a digit and is allowed. + */ + private static final Pattern RESTRICTED_LETTERS_PATTERN = Pattern.compile(".*[a-ce-zA-Z].*"); + + /** + * Matches any escaped character that is not \. \- or \d. + */ + private static final Pattern RESTRICTED_ESCAPED_CHARS_PATTERN = Pattern.compile(".*\\\\[^.d\\-].*"); + + /** + * Matches any regex that consists only of anchors, hyphens (escaped or not), escaped periods, repetitions, the quantifier *, the quantifier +, optionals, + * alternations, and groups in any order with no alphanumeric characters that give any meaningful numeric information. + */ + private static final Pattern NONSENSE_PATTERN = Pattern.compile("^\\^?(\\(*(\\\\\\.)*\\)*|(\\(*\\\\?[\\-*+?|])*\\)*|(\\{.*}))*\\$?$"); + + /** + * Matches any decimal points with ? + * or a repetition quantifier directly following them. + */ + private static final Pattern INVALID_DECIMAL_POINTS_PATTERN = Pattern.compile(".*\\\\\\.[?+*{].*"); + + /** + * Matches against any variation of {@code .*}, {@code .+}, {@code .*?}, {@code .+?} that may or may not repeat, and that may or may not contain start + * and/or end anchors. 
+ */ + private static final Pattern NORMALIZATION_NOT_REQUIRED_PATTERN = Pattern.compile("^\\^?(\\.[*+]\\??)+\\$?$"); + + /** + * Encode the given numeric regex pattern such that it will match against encoded numbers. + * + * @param regex + * the regex pattern + * @return the encoded regex pattern + */ + public static String encode(String regex) { + return new NumericRegexEncoder(regex).encode(); + } + + private final String pattern; + private Node patternTree; + + private NumericRegexEncoder(String pattern) { + this.pattern = pattern; + } + + public static ZeroRegexStatus getZeroRegexStatus(String regex) { + return ZeroTrimmer.getStatus(RegexParser.parse(regex).getChildren()); + } + + private String encode() { + if (log.isDebugEnabled()) { + log.debug("Encoding pattern " + pattern); + } + + // Check the pattern for any quick failures. + checkPatternForQuickFailures(); + // Encode the pattern only if it requires it. + if (isEncodingRequired()) { + parsePatternTree(); + normalizePatternTree(); + encodePatternTree(); + + if (log.isDebugEnabled()) { + log.debug("Encoded pattern '" + pattern + "' to '" + StringVisitor.toString(this.patternTree) + "'"); + } + + return StringVisitor.toString(this.patternTree); + } else { + if (log.isDebugEnabled()) { + log.debug("Encoding not required for pattern '" + pattern + "'"); + } + return this.pattern; + } + } + + /** + * Pre-validate the regex to quickly identify any indications that the regex is not valid for numerical expansion. + */ + private void checkPatternForQuickFailures() { + checkForBlankPattern(); + checkForWhitespace(); + checkForCompilation(); + checkForNonsense(); + checkForRestrictedLetters(); + checkForRestrictedEscapedCharacters(); + checkForGroups(); + checkForQuantifiedDecimalPoints(); + } + + /** + * Throws an exception if the regex pattern is blank. 
+ */ + private void checkForBlankPattern() { + if (this.pattern.isEmpty()) { + throw new IllegalArgumentException("Regex pattern may not be blank."); + } + } + + /** + * Throws an exception if the regex contains any whitespace. + */ + private void checkForWhitespace() { + if (CharMatcher.whitespace().matchesAnyOf(pattern)) { + throw new IllegalArgumentException("Regex pattern may not contain any whitespace."); + } + } + + /** + * Throws an exception if the regex cannot be compiled. + */ + private void checkForCompilation() { + try { + Pattern.compile(this.pattern); + } catch (PatternSyntaxException e) { + throw new IllegalArgumentException("Regex pattern will not compile.", e); + } + } + + private void checkForNonsense() { + if (NONSENSE_PATTERN.matcher(this.pattern).matches()) { + throw new IllegalArgumentException("A nonsense pattern has been given that cannot be normalized."); + } + } + + /** + * Throws an exception if the regex contains any letter other than an escaped lowercase d. + */ + private void checkForRestrictedLetters() { + if (RESTRICTED_LETTERS_PATTERN.matcher(pattern).matches() || containsUnescapedLowercaseD()) { + throw new IllegalArgumentException( + "Regex pattern may not contain any letters other than \\d to indicate a member of the digit character class 0-9."); + } + } + + /** + * Return whether the regex contains an unescaped d. + */ + private boolean containsUnescapedLowercaseD() { + int pos = pattern.indexOf(RegexConstants.LOWERCASE_D); + while (pos != -1) { + if (pos == 0 || pattern.charAt(pos - 1) != RegexConstants.BACKSLASH) { + return true; + } + pos = pattern.indexOf(RegexConstants.LOWERCASE_D, pos + 1); + } + return false; + } + + /** + * Throws an exception if the regex contains any escaped characters other than {@code \.}, {@code \-} or {@code \d}. 
+ */ + private void checkForRestrictedEscapedCharacters() { + if (RESTRICTED_ESCAPED_CHARS_PATTERN.matcher(this.pattern).matches()) { + throw new IllegalArgumentException("Regex pattern may not contain any escaped characters other than \\. \\- or \\d."); + } + } + + /** + * Throws an exception if the regex contains any occurrences of '(' indicating the start of a group. + */ + private void checkForGroups() { + if (this.pattern.contains("(")) { + throw new IllegalArgumentException("Regex pattern may not contain any groups."); + } + } + + /** + * Throws an exception if the regex contains any decimal points directly followed by * + or {}. + */ + private void checkForQuantifiedDecimalPoints() { + if (INVALID_DECIMAL_POINTS_PATTERN.matcher(this.pattern).matches()) { + throw new IllegalArgumentException("Regex pattern may not contain any decimal points that are directly followed by * ? or {}."); + } + } + + /** + * Returns whether the regex requires normalization. + * + * @return true if the regex requires normalization, or false otherwise. + */ + private boolean isEncodingRequired() { + return !NORMALIZATION_NOT_REQUIRED_PATTERN.matcher(this.pattern).matches(); + } + + /** + * Parse the regex to a node tree. + */ + private void parsePatternTree() { + parsePatternToTree(); + validateCharClasses(); + validateDecimalPoints(); + } + + /** + * Normalize the pattern tree. + */ + private void normalizePatternTree() { + trimAnchors(); + trimZeroLengthRepetitions(); + trimEmptyLeafs(); + expandOptionalVariants(); + expandNegativeVariants(); + expandZeroValues(); + } + + /** + * Encode the pattern tree. + */ + private void encodePatternTree() { + dedupe(); + encodeSimpleNumbers(); + // If there are no more unencoded sub-patterns in the tree after encoding simple numbers, no further work needs to be done. 
+ if (!moreToEncode()) { + return; + } + addExponentialBins(); + trimZeros(); + invertNegativePatterns(); + addDecimalPoints(); + dedupe(); + } + + /** + * Parse the pattern to a node tree. + */ + private void parsePatternToTree() { + this.patternTree = RegexParser.parse(this.pattern); + + if (log.isDebugEnabled()) { + log.debug("Parsed pattern to tree structure:\n" + PrintVisitor.printToString(this.patternTree)); + } + } + + /** + * Verify that the regex pattern does not contain any character classes with characters other than digits or a period. + */ + private void validateCharClasses() { + NumericCharClassValidator.validate(this.patternTree); + + if (log.isDebugEnabled()) { + log.debug("Validated character classes in regex"); + } + } + + /** + * Verify that the regex pattern does not contain any alternated expressions that have more than one required decimal point. + */ + private void validateDecimalPoints() { + DecimalPointValidator.validate(this.patternTree); + + if (log.isDebugEnabled()) { + log.debug("Validated decimal points classes in regex"); + } + } + + /** + * Trim all anchors. + */ + private void trimAnchors() { + updatePatternTree(AnchorTrimmer::trim, "trimming anchors"); + } + + /** + * Trim all elements that occur exactly zero times. + */ + private void trimZeroLengthRepetitions() { + updatePatternTree(ZeroLengthRepetitionTrimmer::trim, "trimming zero-length repetition characters"); + + // If the pattern is empty afterwards, throw an exception. + if (this.patternTree == null) { + throw new IllegalArgumentException("Regex pattern is empty after trimming all characters followed by {0} or {0,0}."); + } + } + + /** + * Trim the tree of any empty nodes and empty alternations, and verify if we still have a pattern to encode. + */ + private void trimEmptyLeafs() { + updatePatternTree(EmptyLeafTrimmer::trim, "trimming empty leafs"); + } + + /** + * Expand optional variants. 
+ */ + private void expandOptionalVariants() { + updatePatternTree(OptionalVariantExpander::expand, "expanding optional variants"); + } + + /** + * Expand any patterns beginning with {@code .} to include a version with a minus sign in front of it. + */ + private void expandNegativeVariants() { + updatePatternTree(NegativeVariantExpander::expand, "expanding negative variants"); + } + + /** + * If any patterns can match the number '0', add an alternation with '0'. + */ + private void expandZeroValues() { + updatePatternTree(ZeroValueNormalizer::expand, "normalizing zero-value characters"); + } + + /** + * Remove any duplicate alternations. + */ + private void dedupe() { + updatePatternTree(AlternationDeduper::dedupe, "de-duping"); + } + + /** + * Encode any and all simple numbers present in the pattern. + */ + private void encodeSimpleNumbers() { + updatePatternTree(SimpleNumberEncoder::encode, "encoding simple numbers"); + } + + /** + * Return whether there are unencoded sub-patterns in the tree after encoding simple numbers. + * + * @return true if there are more patterns to encode, or false otherwise + */ + private boolean moreToEncode() { + return NonEncodedNumbersChecker.check(this.patternTree); + } + + /** + * Add exponential bin range information, e.g. \+[a-z], ![A-Z], etc. + */ + private void addExponentialBins() { + updatePatternTree(ExponentialBinAdder::addBins, "adding exponential bin information"); + } + + /** + * Trim/consolidate any leading zeros in partially-encoded patterns. + */ + private void trimZeros() { + updatePatternTree(ZeroTrimmer::trim, "trimming leading/trailing zeros"); + } + + /** + * Invert any patterns that are meant to match negative numbers. + */ + private void invertNegativePatterns() { + updatePatternTree(NegativeNumberPatternInverter::invert, "inverting patterns for negative numbers"); + } + + /** + * Add decimal points where required. 
+ */ + private void addDecimalPoints() { + updatePatternTree(DecimalPointPlacer::addDecimalPoints, "adding decimal points"); + } + + private void updatePatternTree(Function function, String operationDescription) { + this.patternTree = function.apply(this.patternTree); + + if (log.isDebugEnabled()) { + log.debug("Regex after " + operationDescription + ": " + StringVisitor.toString(this.patternTree)); + } + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/OneOrMoreNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/OneOrMoreNode.java new file mode 100644 index 00000000000..1876642b856 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/OneOrMoreNode.java @@ -0,0 +1,32 @@ +package datawave.data.normalizer.regex; + +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents the plus sign in a regex pattern. + */ +public class OneOrMoreNode extends Node { + + public OneOrMoreNode() {} + + public OneOrMoreNode(Map properties) { + super(properties); + } + + @Override + public NodeType getType() { + return NodeType.ONE_OR_MORE; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitOneToMany(this, data); + } + + @Override + public OneOrMoreNode shallowCopy() { + return new OneOrMoreNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/QuestionMarkNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/QuestionMarkNode.java new file mode 100644 index 00000000000..0dd34dbefa7 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/QuestionMarkNode.java @@ -0,0 +1,32 @@ +package datawave.data.normalizer.regex; + +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents the question mark in a regex pattern. 
+ */ +public class QuestionMarkNode extends Node { + + public QuestionMarkNode() {} + + public QuestionMarkNode(Map properties) { + super(properties); + } + + @Override + public NodeType getType() { + return NodeType.OPTIONAL; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitQuestionMark(this, data); + } + + @Override + public QuestionMarkNode shallowCopy() { + return new QuestionMarkNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexConstants.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexConstants.java new file mode 100644 index 00000000000..df1ef9ee1c2 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexConstants.java @@ -0,0 +1,65 @@ +package datawave.data.normalizer.regex; + +import java.util.List; +import java.util.Set; +import java.util.regex.Pattern; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; + +public class RegexConstants { + + public static final char ZERO = '0'; + public static final char ONE = '1'; + public static final char TWO = '2'; + public static final char THREE = '3'; + public static final char FOUR = '4'; + public static final char FIVE = '5'; + public static final char SIX = '6'; + public static final char SEVEN = '7'; + public static final char EIGHT = '8'; + public static final char NINE = '9'; + public static final char LOWERCASE_D = 'd'; + public static final char BACKSLASH = '\\'; + public static final char PERIOD = '.'; + public static final char HYPHEN = '-'; + public static final char STAR = '*'; + public static final char PLUS = '+'; + public static final char PIPE = '|'; + public static final char LEFT_PAREN = '('; + public static final char RIGHT_PAREN = ')'; + public static final char LEFT_BRACKET = '['; + public static final char RIGHT_BRACKET = ']'; + public 
static final char EXCLAMATION_POINT = '!'; + public static final char LEFT_BRACE = '{'; + public static final char RIGHT_BRACE = '}'; + public static final char QUESTION_MARK = '?'; + public static final char COMMA = ','; + public static final char CARET = '^'; + public static final char DOLLAR_SIGN = '$'; + public static final char CAPITAL_E = 'E'; + + public static final String ESCAPED_BACKSLASH = "\\\\"; + + /** + * Use base 10 when parsing characters to ints. + */ + public static final int DECIMAL_RADIX = 10; + + /** + * The set of all digits. This reflects all possible permutations for any \d found in the regex. + */ + public static final List ALL_DIGITS = ImmutableList.of(ZERO, ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE); + + public static final Set> QUANTIFIER_TYPES = ImmutableSet.of(ZeroOrMoreNode.class, OneOrMoreNode.class, RepetitionNode.class); + + public static final Set> SIMPLE_NUMBER_TYPES = ImmutableSet.of(SingleCharNode.class, EscapedSingleCharNode.class, + StartAnchorNode.class, EndAnchorNode.class); + + public static final Pattern SIMPLE_NUMBER_REGEX_PATTERN = Pattern.compile("^\\^?(\\\\?-)?\\d*(\\\\\\.)?\\d+\\$?$"); + + private RegexConstants() { + throw new UnsupportedOperationException(); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexParser.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexParser.java new file mode 100644 index 00000000000..b1077b1d45b --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexParser.java @@ -0,0 +1,305 @@ +package datawave.data.normalizer.regex; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.lang3.StringUtils; + +/** + * This parser will create a {@link Node} tree parsed from a regex pattern. This parser will be used for normalizing numeric regex patterns, and as such is not + * intended to be a fully comprehensive regex parser. 
Some native regex characters may be restricted. + */ +public class RegexParser { + + /** + * Parses the given regex and returns a {@link ExpressionNode} tree representing the parsed regex. If the string is null, null will be returned. + * + * @param regex + * the regex to parse + * @return the {@link Node} tree + */ + public static ExpressionNode parse(String regex) { + if (regex == null) { + return null; + } + Node node = parseAlternations(regex); + // Ensure the root node is always an expression node. + return node instanceof ExpressionNode ? (ExpressionNode) node : createExpressionWithChild(node); + } + + /** + * Parses a regex expression from the given string that may contain alternations. Depending on the expression, one of the following will be returned: + *

    + *
  • An {@link EmptyNode} will be returned if a blank string is given.
  • + *
  • If the expression contains top-level alternations, an {@link AlternationNode} whose children are the parsed alternatives will be returned + * without an {@link ExpressionNode} wrapper; {@link #parse(String)} adds that wrapper at the root.
  • + *
  • If the expression does not contain any top-level alternations, an {@link ExpressionNode} with the parsed elements as its children will be + * returned, except that an expression consisting of a single group is returned as that {@link GroupNode} itself.
  • + *
+ * + * @param string + * the string + * @return the parsed node + */ + private static Node parseAlternations(String string) { + // If the string is blank, return an EmptyNode. + if (StringUtils.isBlank(string)) { + return new EmptyNode(); + } + + List expressions = RegexUtils.splitOnAlternations(string); + Node node; + if (expressions.size() > 1) { + // If we have more than one expression, we must make the parsed expressions children of an alternation node. + node = new AlternationNode(); + for (String segment : expressions) { + Node child = parseAlternations(segment); + if (child != null) { + node.addChild(child); + } + } + } else if (expressions.size() == 1) { + node = parseExpression(expressions.get(0)); + } else { + return null; + } + // If the parsed node is not an AlternationNode, GroupNode, or ExpressionNode, wrap it in an ExpressionNode. + return requiresWrap(node) ? createExpressionWithChild(node) : node; + } + + /** + * Parses a subset of a regex expression that does not contain any top-level alternations, i.e. pipes. + * + * @param string + * the regex to parse + * @return the parsed node + */ + private static Node parseExpression(String string) { + + // If the string is blank, return an EmptyNode. + if (StringUtils.isBlank(string)) { + return new EmptyNode(); + } + + List nodes = new ArrayList<>(); + RegexReader reader = new RegexReader(string); + while (reader.hasNext()) { + reader.captureNext(); + RegexReader.ExpressionType type = reader.capturedType(); + String content = reader.capturedExpression(); + nodes.add(createNode(type, content)); + } + + // If we have a single child parsed from the expression, wrap it in an expression node if it is not already a wrapper node. Otherwise, return the child. + if (nodes.size() == 1) { + Node child = nodes.get(0); + return requiresWrap(child) ? createExpressionWithChild(child) : child; + } else { + // Wrap the children in an expression node. 
+ ExpressionNode expressionNode = new ExpressionNode(); + expressionNode.setChildren(nodes); + return expressionNode; + } + } + + /** + * Return a new {@link ExpressionNode} with the given node as its child. + * + * @param child + * the child + * @return the new node + */ + private static ExpressionNode createExpressionWithChild(Node child) { + ExpressionNode node = new ExpressionNode(); + node.addChild(child); + return node; + } + + /** + * Return whether the given node should be wrapped in an {@link ExpressionNode}. A node should not be wrapped if it is an instance of one of the following: + *
    + *
  • {@link ExpressionNode}
  • + *
  • {@link GroupNode}
  • + *
  • {@link AlternationNode}
  • + *
+ * + * @param node + * the node + * @return true if the given node is a wrapper type, or false otherwise. + */ + private static boolean requiresWrap(Node node) { + return node != null && !(node instanceof ExpressionNode || node instanceof AlternationNode || node instanceof GroupNode); + } + + /** + * Return a new node of the specified type with the given content if applicable. + * + * @param type + * the node type to create + * @param content + * the content + * @return the new node + */ + private static Node createNode(RegexReader.ExpressionType type, String content) { + switch (type) { + case ANCHOR_START: + return new StartAnchorNode(); + case ANCHOR_END: + return new EndAnchorNode(); + case ESCAPED_CHAR: + return createNodeFromEscapedChar(content); + case ANY_CHAR: + return new AnyCharNode(); + case ZERO_OR_MORE: + return new ZeroOrMoreNode(); + case ONE_OR_MORE: + return new OneOrMoreNode(); + case QUESTION_MARK: + return new QuestionMarkNode(); + case SINGLE_CHAR: + return new SingleCharNode(content.charAt(0)); + case REPETITION: + return createRepetitionNode(content); + case CHAR_CLASS: + return createCharClassNode(content); + case GROUP: + return createGroupNode(content); + default: + throw new IllegalArgumentException("Unable to create new node of type " + type); + } + } + + /** + * Return a new {@link Node} from the given escaped character. In the case of {@code \d}, a new {@link DigitCharClassNode} will be returned. Otherwise, a + * new {@link EscapedSingleCharNode} with the character will be returned. + * + * @param content + * the content + * @return the new node + */ + private static Node createNodeFromEscapedChar(String content) { + char character = content.charAt(1); + if (character == RegexConstants.LOWERCASE_D) { + return new DigitCharClassNode(); + } + return new EscapedSingleCharNode(character); + } + + /** + * Return a new {@link RepetitionNode} parsed from the given expression. 
It is expected that the given content is an interval expression in the form + * {@code {x}}, {@code {x,y}}, {@code {x,}}, or {@code {,y}}. + * + * @param expression + * the interval expression + * @return the node + */ + private static RepetitionNode createRepetitionNode(String expression) { + RepetitionNode node = new RepetitionNode(); + int commaIndex = expression.indexOf(RegexConstants.COMMA); + if (commaIndex == -1) { + // If no comma is present, the interval expression is in the form {x}. Remove the curly braces and parse the number from x. + node.addChild(new IntegerNode(Integer.parseInt(trimFirstAndLastChar(expression)))); + } else { + // If a comma is present, the interval expression is in the form {x,y} or {x,}. Remove the curly braces and parse the range from x and y. + int start = Integer.parseInt(expression.substring(1, commaIndex)); + Integer end = commaIndex == (expression.length() - 2) ? null : Integer.parseInt(expression.substring((commaIndex + 1), (expression.length() - 1))); + node.addChild(new IntegerRangeNode(start, end)); + } + return node; + } + + /** + * Return a new {@link CharClassNode} parsed from the given expression. Parsing negated character classes is supported. The character class may only contain + * the following: digits, a period, a hyphen, a numerical range. + * + * @param expression + * the character class expression + * @return the node + */ + private static CharClassNode createCharClassNode(String expression) { + CharClassNode node = new CharClassNode(); + char[] chars = expression.toCharArray(); + char next; + for (int pos = 1; pos < (chars.length - 1); pos++) { + char current = chars[pos]; + switch (current) { + case RegexConstants.HYPHEN: + // We found a hyphen at the start or end of the character class, e.g. [-123] or [123-]. Hyphens do not need to be escaped in these cases. + node.addChild(new SingleCharNode(current)); + break; + case RegexConstants.BACKSLASH: + // We found an escaped character. 
+ next = chars[(pos) + 1]; + node.addChild(new EscapedSingleCharNode(next)); + pos++; + break; + case RegexConstants.CARET: + // If the caret is the first character in the class, we have a negated character class, e.g. [^123]. + if (pos == 1) { + node.negate(); + } else { + // Otherwise add it as a single character. + node.addChild(new SingleCharNode(current)); + } + break; + default: + // Check if we have a non-trailing hyphen that indicates a defined character range. + next = chars[(pos + 1)]; + if (next == RegexConstants.HYPHEN) { + char charAfterNext = chars[(pos) + 2]; + // If the next character is not a closing bracket, we have a character range. Otherwise, the hyphen will need to be captured as its own + // single character in an earlier switch case, + if (charAfterNext != RegexConstants.RIGHT_BRACKET) { + node.addChild(new CharRangeNode(current, charAfterNext)); + // Move to the next character after the range. + pos = pos + 2; + } + } else { + // Otherwise, add the current character as a single character. + node.addChild(new SingleCharNode(current)); + } + break; + } + } + return node; + } + + /** + * Return a new {@link GroupNode} parsed from the given expression. + * + * @param expression + * the group expression + * @return the node + */ + private static GroupNode createGroupNode(String expression) { + String subExpression = trimFirstAndLastChar(expression); + GroupNode groupNode = new GroupNode(); + Node node = parseAlternations(subExpression); + if (node != null) { + groupNode.addChild(node); + } + return groupNode; + } + + /** + * Return the given string with the first and last character trimmed. If the string has a length less than 3, an empty string will be returned. 
+ * + * @param str + * the string + * @return the trimmed string + */ + private static String trimFirstAndLastChar(String str) { + if (str.length() < 3) { + return ""; + } else { + return str.substring(1, (str.length() - 1)); + } + } + + /** + * Do not allow this class to be instantiated. + */ + private RegexParser() { + throw new UnsupportedOperationException(); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexReader.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexReader.java new file mode 100644 index 00000000000..b3b20df7af1 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexReader.java @@ -0,0 +1,246 @@ +package datawave.data.normalizer.regex; + +import java.util.Arrays; + +import com.google.common.base.Preconditions; + +/** + * A reader that traverses over a regex pattern and both identifies and steps through individual regex elements. + */ +class RegexReader { + + public enum ExpressionType { + GROUP, ALTERNATION, REPETITION, CHAR_CLASS, SINGLE_CHAR, ESCAPED_CHAR, ANY_CHAR, ZERO_OR_MORE, ONE_OR_MORE, QUESTION_MARK, ANCHOR_START, ANCHOR_END + } + + /** + * The original char array of the pattern. + */ + private final char[] pattern; + + /** + * Index into the pattern array that keeps track of how much has been read. + */ + private int cursor = 0; + + /** + * The type of the most recently read regex expression. + */ + private ExpressionType capturedType; + + /** + * The content of the most recently read regex expression. + */ + private String capturedContent; + + /** + * Create a new {@link RegexReader} that will read over the given regex pattern. + * + * @param pattern + * the regex pattern to read over + */ + public RegexReader(String pattern) { + Preconditions.checkNotNull(pattern, "regex must not be null"); + this.pattern = pattern.toCharArray(); + } + + /** + * Return whether there is another expression to capture in this reader. 
+ * + * @return true if there is another expression, or false otherwise + */ + public boolean hasNext() { + return cursor < pattern.length; + } + + /** + * Return the {@link ExpressionType} identified for the next expression during the last call to {@link #captureNext()}, or null if {@link #captureNext()} + * has never been called. + * + * @return the captured type + */ + public ExpressionType capturedType() { + return capturedType; + } + + /** + * Return the string content identified for the next expression during the last call to {@link #captureNext()}, or null if {@link #captureNext()} has never + * been called. + * + * @return the captured string expression + */ + public String capturedExpression() { + return capturedContent; + } + + /** + * Identify and capture the regex node type and content of the next expression in this reader. + * + * @throws IllegalStateException + * if {@link #hasNext()} returns false + */ + public void captureNext() { + if (hasNext()) { + identifyCurrentType(); + int startOfCapture = cursor; + skipPastCurrentExpression(); + this.capturedContent = new String(Arrays.copyOfRange(pattern, startOfCapture, cursor)); + } else { + throw new IllegalStateException("Reader does not have next to capture"); + } + } + + /** + * Identify the type of the current expression starting at the current cursor point. 
+ */ + private void identifyCurrentType() { + char current = current(); + switch (current) { + case RegexConstants.PIPE: + this.capturedType = ExpressionType.ALTERNATION; + break; + case RegexConstants.LEFT_PAREN: + this.capturedType = ExpressionType.GROUP; + break; + case RegexConstants.LEFT_BRACE: + this.capturedType = ExpressionType.REPETITION; + break; + case RegexConstants.LEFT_BRACKET: + this.capturedType = ExpressionType.CHAR_CLASS; + break; + case RegexConstants.CARET: + this.capturedType = ExpressionType.ANCHOR_START; + break; + case RegexConstants.DOLLAR_SIGN: + this.capturedType = ExpressionType.ANCHOR_END; + break; + case RegexConstants.PERIOD: + this.capturedType = ExpressionType.ANY_CHAR; + break; + case RegexConstants.STAR: + this.capturedType = ExpressionType.ZERO_OR_MORE; + break; + case RegexConstants.PLUS: + this.capturedType = ExpressionType.ONE_OR_MORE; + break; + case RegexConstants.QUESTION_MARK: + this.capturedType = ExpressionType.QUESTION_MARK; + break; + case RegexConstants.BACKSLASH: + this.capturedType = ExpressionType.ESCAPED_CHAR; + break; + default: + this.capturedType = ExpressionType.SINGLE_CHAR; + } + } + + /** + * Return the character in the chars array at the current cursor index. + * + * @return the current character + */ + private char current() { + return pattern[cursor]; + } + + /** + * Increments the cursor by one and returns the next character in the char array. + * + * @return the next character + */ + private char next() { + return pattern[++cursor]; + } + + /** + * Increment the cursor by one. + */ + private void skip() { + cursor++; + } + + /** + * Increment the cursor by the given number of skips. + * + * @param skips + * the skips to increment by + */ + private void skip(int skips) { + cursor = cursor + skips; + } + + /** + * Increment the cursor to point to the position after the current expression based on the current captured type. 
+ */ + private void skipPastCurrentExpression() { + switch (capturedType) { + case SINGLE_CHAR: + case ALTERNATION: + case ANY_CHAR: + case ZERO_OR_MORE: + case ONE_OR_MORE: + case QUESTION_MARK: + case ANCHOR_START: + case ANCHOR_END: + skip(1); + break; + case ESCAPED_CHAR: + skip(2); + break; + case CHAR_CLASS: + skipPastChar(RegexConstants.RIGHT_BRACKET); + break; + case REPETITION: + skipPastChar(RegexConstants.RIGHT_BRACE); + break; + case GROUP: + skipPastGroup(); + break; + default: + throw new IllegalArgumentException("Unable to seek past type " + capturedType); + } + } + + /** + * Increment the cursor to point to the position after the first occurrence of the given character. + * + * @param character + * the character to skip past + */ + private void skipPastChar(char character) { + while (hasNext()) { + char next = next(); + if (next == character) { + skip(); + return; + } + } + } + + /** + * Increment the cursor to point to the position after the current group expression. This method will handle nested groups. + */ + private void skipPastGroup() { + int nestedGroups = 0; + while (hasNext()) { + char next = next(); + switch (next) { + case RegexConstants.RIGHT_PAREN: + // If there are no nested groups, we've found the end of the target group. Skip ahead to the next character after it. + if (nestedGroups == 0) { + skip(); + return; + } else { + // We've traversed to the end of a nested group. + nestedGroups--; + } + break; + case RegexConstants.LEFT_PAREN: + // If we encounter a ( before the first ) we see, we've found a nested group and must traverse to the end it. 
+ nestedGroups++; + break; + default: + } + } + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexUtils.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexUtils.java new file mode 100644 index 00000000000..2fdd487e3d5 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexUtils.java @@ -0,0 +1,639 @@ +package datawave.data.normalizer.regex; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +import org.apache.commons.lang3.tuple.Pair; + +import datawave.data.normalizer.regex.visitor.StringVisitor; +import datawave.data.type.util.NumericalEncoder; + +public class RegexUtils { + + /** + * Split the given string by all top-level alternations into individual regex segments to be further evaluated. Any pipes encapsulated within groups, e.g. + * (1|2|3) will not count as alternations to split. See the following input examples: + *
    + *
  • Input {@code ""} will return the list {@code {""}}
  • + *
  • Input {@code "234.*"} will return the list {@code {"234.*"}}
  • + *
  • Input {@code "234.*|45|653.*"} will return the list {@code {"234.*", "45", "653.*"}}
  • + *
  • Input {@code "234.*|45|(3[34].*|4[54]3)"} will return the list {@code {"234.*", "45", "(3[34].*|4[54]3)"}}
  • + *
  • Input {@code "|34"} will return the list {@code {"", "34"}}
  • + *
  • Input {@code "34|"} will return the list {@code {"34", ""}}
  • + *
  • Input {@code "||"} will return the list {@code {"", "", ""}}
  • + *
  • Input {@code "|12||4|34|"} will return the list {@code {"", "12", "", "4", "34", ""}}
  • + *
+ * + * @param str + * the string to split + * @return the split segments + */ + public static List splitOnAlternations(String str) { + List segments = new ArrayList<>(); + // If the string is empty, return a list containing an empty string. + if (str.isEmpty()) { + segments.add(""); + return segments; + } + + char[] chars = str.toCharArray(); + int strLength = chars.length; + int lastPos = strLength - 1; + int groupsToTraverse = 0; + int startOfSegment = 0; + // Stream over the string one character at a time. + for (int pos = 0; pos < strLength; pos++) { + char current = chars[pos]; + if (pos != lastPos) { + switch (current) { + case RegexConstants.PIPE: + // If we found an alternation, it is top-level only if there are no groups we currently need to traverse. + if (groupsToTraverse == 0) { + // If the start of the segment is not the current position, we have a non-zero length segment. + if (startOfSegment != pos) { + segments.add(str.substring(startOfSegment, (pos))); + } else { + // Otherwise we've encountered an empty alternation somewhere before the end of the string. + segments.add(""); + } + // Mark the start of the next segment as the next character. + startOfSegment = pos + 1; + } + break; + case RegexConstants.LEFT_PAREN: + // We found the start of a group. Increment the number of groups we need to traverse. + groupsToTraverse++; + break; + case RegexConstants.RIGHT_PAREN: + // We found the end of a group. Decrement the number of groups we need to traverse. + groupsToTraverse--; + break; + default: + } + } else { + // If the last character is not a pipe, it is part of the last segment. + if (current != RegexConstants.PIPE) { + segments.add(str.substring(startOfSegment)); + } else { + // If we have a zero-length segment, add an empty alternation. + if (startOfSegment == pos) { + segments.add(""); + } else { + // Otherwise the segment ends at the character before last. 
+ segments.add(str.substring(startOfSegment, lastPos)); + } + // Add a trailing empty segment. + segments.add(""); + } + } + } + return segments; + } + + /** + * Return whether the regex consists of a single simple number without any special operations, e.g. '1', '1\\.0', '-1', '-1\\.0'. + */ + public static boolean isNumber(String str) { + char[] chars = str.toCharArray(); + int lastPos = chars.length - 1; + for (int currentPos = 0; currentPos <= lastPos; currentPos++) { + char current = chars[currentPos]; + switch (current) { + case RegexConstants.BACKSLASH: + case RegexConstants.HYPHEN: + case RegexConstants.ONE: + case RegexConstants.TWO: + case RegexConstants.THREE: + case RegexConstants.FOUR: + case RegexConstants.FIVE: + case RegexConstants.SIX: + case RegexConstants.SEVEN: + case RegexConstants.EIGHT: + case RegexConstants.NINE: + continue; + case RegexConstants.PERIOD: + // If we encounter a period at the beginning of the regex, we know it is a dot wildcard and not an escaped decimal point. + if (currentPos == 0) { + return false; + } else { + // If we encounter a period anywhere else in the regex, if it is not preceded by a backslash to indicate that it's an escaped decimal + // point, then it is a dot wildcard. + char prev = chars[(currentPos - 1)]; + if (prev != RegexConstants.BACKSLASH) { + return false; + } + } + break; + default: + // Any characters other than 0-9, -, or \. indicate a non-simple number regex. + return false; + } + } + return true; + } + + /** + * Returns the escaped, encoded form of a string containing a number from part of a regex. The string must be a number, and may be escaped. See the + * following input examples: + *
    + *
  • Input {@code "1.2"} will return {@code "\+aE1\.2"}
  • + *
  • Input {@code "1\.2"} will return {@code "\+aE1\.2"}
  • + *
  • Input {@code "12"} will return {@code "\+bE1\.2"}
  • + *
  • Input {@code "-1\.2"} will return {@code "\!ZE1\.2"}
  • + *
  • Input {@code "-12"} will return {@code "\!YE1\.2"}
  • + *
+ * + * @param str + * the string to encode + * @return the escaped, encoded number + */ + public static String encodeNumber(String str) { + return escapeEncodedNumber(NumericalEncoder.encode(removeBackslashes(str))); + } + + /** + * Return the given string with all backslashes removed from it. + * + * @param str + * the string + * @return the string without any backslashes + */ + public static String removeBackslashes(String str) { + return str.replaceAll(RegexConstants.ESCAPED_BACKSLASH, ""); + } + + /** + * Return an encoded whole number with the characters {@code . ! +} escaped by a backslash. + */ + public static String escapeEncodedNumber(String str) { + StringBuilder sb = new StringBuilder(); + for (char current : str.toCharArray()) { + if (current == RegexConstants.PERIOD || current == RegexConstants.PLUS) { + sb.append(RegexConstants.BACKSLASH); + } + sb.append(current); + } + return sb.toString(); + } + + /** + * Return the index of the first escaped period present in the children of the given node tree, or -1 if no such child is found. + * + * @param node + * the node + * @return the index of the first escaped period, or -1 if not found + */ + public static int getDecimalPointIndex(Node node) { + int index = node.indexOf(EscapedSingleCharNode.class); + while (index != -1) { + EscapedSingleCharNode escapedNode = (EscapedSingleCharNode) node.getChildAt(index); + if (escapedNode.getCharacter() == RegexConstants.PERIOD) { + return index; + } + index = node.indexOf(EscapedSingleCharNode.class, (index + 1)); + } + return -1; + } + + /** + * Returns whether the first child in the given node tree is a minus sign. + * + * @param node + * the node + * @return true if the first child is a minus sign, or false otherwise + */ + public static boolean isNegativeRegex(Node node) { + return isChar(node.getFirstChild(), RegexConstants.HYPHEN); + } + + /** + * Return whether the given node is an escaped period. 
+ * + * @param node + * the node + * @return true if the given node is an escaped period, or false otherwise. + */ + public static boolean isDecimalPoint(Node node) { + return node instanceof EscapedSingleCharNode && ((EscapedSingleCharNode) node).getCharacter() == RegexConstants.PERIOD; + } + + /** + * Return whether the given node is the given character, escaped or otherwise. + * + * @param node + * the node + * @param character + * the character + * @return true if the given node is the given character, or false otherwise + */ + public static boolean isChar(Node node, char character) { + if (node instanceof SingleCharNode) { + return ((SingleCharNode) node).getCharacter() == character; + } else if (node instanceof EscapedSingleCharNode) { + return ((EscapedSingleCharNode) node).getCharacter() == character; + } + return false; + } + + /** + * Return whether the given node is a character class that would match against the given character. + * + * @param node + * the node + * @param character + * the character + * @return true if the given character class would match against the given character, or false otherwise + * @throws IllegalArgumentException + * if the given node is not a {@link CharClassNode} + */ + public static boolean charClassMatches(Node node, char character) { + if (node instanceof CharClassNode) { + CharClassNode charClass = (CharClassNode) node; + boolean matchFound = false; + for (Node child : charClass.getChildren()) { + // If the current child is a single character, see if it is a match for the character. + if (child instanceof SingleCharNode) { + if (isChar(child, character)) { + matchFound = true; + break; + } + } else { + // If the current child is a character range, see if it is within the range. 
+ CharRangeNode charRange = (CharRangeNode) child; + int charDigit = Character.digit(character, RegexConstants.DECIMAL_RADIX); + int startDigit = Character.digit(charRange.getStart(), RegexConstants.DECIMAL_RADIX); + int endDigit = Character.digit(charRange.getEnd(), RegexConstants.DECIMAL_RADIX); + if (startDigit <= charDigit && charDigit <= endDigit) { + matchFound = true; + break; + } + } + } + // If the character class was negated, e.g. [^1-5], it matches against the character if no direct match was found. + return charClass.isNegated() != matchFound; + } else { + throw new IllegalArgumentException("Node must be a " + CharClassNode.class.getSimpleName()); + } + } + + /** + * Return whether the given node is a character class that would only match against the given character. + * + * @param node + * the node + * @param character + * the character + * @return true if the given character class would only match against the given character, or false otherwise + * @throws IllegalArgumentException + * if the given node is not a {@link CharClassNode} + */ + public static boolean charClassMatchesOnly(Node node, char character) { + if (node instanceof CharClassNode) { + CharClassNode charClass = (CharClassNode) node; + boolean matchFound = false; + for (Node child : charClass.getChildren()) { + // If the current child is a single character, see if it is a match for the character. + if (child instanceof SingleCharNode) { + if (isChar(child, character)) { + matchFound = true; + } else { + // A character other than the target was found. + return false; + } + } else { + // If the current child is a character range, see the range only encompasses the target character, e.g. [1-1]. 
+ CharRangeNode charRange = (CharRangeNode) child; + int charDigit = Character.digit(character, RegexConstants.DECIMAL_RADIX); + int startDigit = Character.digit(charRange.getStart(), RegexConstants.DECIMAL_RADIX); + int endDigit = Character.digit(charRange.getEnd(), RegexConstants.DECIMAL_RADIX); + if (startDigit == charDigit && charDigit == endDigit) { + matchFound = true; + } else { + // A range encompassing characters other than the target was found. + return false; + } + } + } + // If the character class was negated, e.g. [^1], it matches against the character if no direct match was found. + return charClass.isNegated() != matchFound; + } else { + throw new IllegalArgumentException("Node must be a " + CharClassNode.class.getSimpleName()); + } + } + + /** + * Return whether the given node is a regex element that would match against the given character. + * + * @param node + * the regex element + * @param character + * the character + * @return true if the given node would match against the given character, or false otherwise + */ + public static boolean matchesChar(Node node, char character) { + switch (node.getType()) { + case DIGIT_CHAR_CLASS: + case ANY_CHAR: + return true; + case SINGLE_CHAR: + return isChar(node, character); + case CHAR_CLASS: + return charClassMatches(node, character); + default: + return false; + } + } + + public static boolean groupNodeMatches(Node node, char character) { + GroupNode group = (GroupNode) node; + boolean matchFound = false; + + for (Node child : group.getChildren()) { + // If the current child is a single character, see if it is a match for the character. + if (child instanceof SingleCharNode) { + if (isChar(child, character)) { + matchFound = true; + } else { + // A character other than the target was found, but there may be more in the group + continue; + } + } + } + return matchFound; + } + + /** + * Return whether the given node is a regex element that can only match against the given character. 
+ * + * @param node + * the node + * @return true if the node can match only against the given character or false otherwise. + */ + public static boolean matchesCharOnly(Node node, char character) { + switch (node.getType()) { + case SINGLE_CHAR: + return isChar(node, character); + case CHAR_CLASS: + return charClassMatchesOnly(node, character); + default: + return false; + } + } + + /** + * Return whether the given node is a regex element that can match against the character '0'. + * + * @param node + * the node + * @return true if the node can match against '0' or false otherwise. + */ + public static boolean matchesZero(Node node) { + return matchesChar(node, RegexConstants.ZERO); + } + + public static boolean matchesCharExplicitly(Node node, char character) { + switch (node.getType()) { + case SINGLE_CHAR: + return isChar(node, character); + case CHAR_CLASS: + return charClassMatches(node, character); + case GROUP: + return groupNodeMatches(node, character); + default: + return false; + } + } + + public static boolean matchesZeroExplicitly(Node node) { + return matchesCharExplicitly(node, RegexConstants.ZERO); + } + + /** + * Return whether the given node is a regex element that can only match against the character '0'. + * + * @param node + * the node + * @return true if the node can match only against '0' or false otherwise. + */ + public static boolean matchesZeroOnly(Node node) { + return matchesCharOnly(node, RegexConstants.ZERO); + } + + /** + * Return whether the given node is a quantifier type. + * + * @param node + * the node + * @return true if the node is a quantifier type, or false otherwise + */ + public static boolean isQuantifier(Node node) { + return RegexConstants.QUANTIFIER_TYPES.contains(node.getClass()); + } + + /** + * Return a range representing the number of occurrences the given node can match against. The left side will be at a minimum, 0, and the right side may be + * a number, or null (infinity). 
+ * + * @param node + * the node + * @return the occurrence range + * @throws IllegalArgumentException + * if the given node is not a quantifier type + */ + public static Pair getQuantifierRange(Node node) { + if (!isQuantifier(node)) { + throw new IllegalArgumentException("Node must be one of the following quantifier types: " + RegexConstants.QUANTIFIER_TYPES); + } + int min; + Integer max = null; + switch (node.getType()) { + case ZERO_OR_MORE: + // Minimum occurrence of 0. + min = 0; + break; + case ONE_OR_MORE: + // Minimum occurrence of 1. + min = 1; + break; + case REPETITION: + Node child = node.getFirstChild(); + if (child instanceof IntegerNode) { + // Minimum and maximum occurrences will be the same. + min = ((IntegerNode) child).getValue(); + max = min; + } else { + IntegerRangeNode rangeNode = (IntegerRangeNode) child; + // Minimum is defined in range. Maximum may be infinity if not defined. + min = rangeNode.getStart(); + if (rangeNode.isEndBounded()) { + max = rangeNode.getEnd(); + } + } + break; + default: + throw new IllegalArgumentException("Unhandled quantifier type: " + RegexConstants.QUANTIFIER_TYPES); + } + return Pair.of(min, max); + } + + /** + * Return whether the given node represents a simple number regex. + * + * @param node + * the node + * @return true if the node is a simple number regex, or false otherwise + */ + public static boolean isSimpleNumber(Node node) { + if (node.isAnyChildNotOf(RegexConstants.SIMPLE_NUMBER_TYPES)) { + return false; + } + String expression = StringVisitor.toString(node); + return RegexConstants.SIMPLE_NUMBER_REGEX_PATTERN.matcher(expression).matches(); + } + + /** + * Return the given digit character as an integer. + * + * @param digit + * the digit character + * @return the integer form + */ + public static int toInt(char digit) { + return Character.digit(digit, RegexConstants.DECIMAL_RADIX); + } + + /** + * Return the given int as a digit character. 
+ * + * @param digit + * the int + * @return the digit character + */ + public static char toChar(int digit) { + return Character.forDigit(digit, RegexConstants.DECIMAL_RADIX); + } + + /** + * Return whether the given quantifier node allows for zero occurrences. + * + * @param node + * the node + * @return true if the quantifier allows for zero occurrences, or false otherwise + */ + public static boolean canOccurZeroTimes(Node node) { + if (!isQuantifier(node)) { + throw new IllegalArgumentException("Node must be one of the following quantifier types: " + RegexConstants.QUANTIFIER_TYPES); + } + switch (node.getType()) { + case ZERO_OR_MORE: + return true; + case ONE_OR_MORE: + return false; + case REPETITION: + return repetitionCanOccurZeroTimes((RepetitionNode) node); + default: + throw new IllegalArgumentException("Unhandled quantifier type: " + RegexConstants.QUANTIFIER_TYPES); + } + } + + /** + * Return whether the given repetition quantifier node allows for zero occurrences. + * + * @param node + * the node + * @return true if the quantifier allows for zero occurrences, or false otherwise + */ + public static boolean repetitionCanOccurZeroTimes(RepetitionNode node) { + Node child = node.getFirstChild(); + if (child instanceof IntegerNode) { + return ((IntegerNode) child).getValue() == 0; + } else { + return ((IntegerRangeNode) child).getStart() == 0; + } + } + + /** + * Return the given repetition as an occurrence range. 
+ * + * @param node + * the node + * @return the range + */ + public static Pair getRepetitionAsRange(RepetitionNode node) { + Node child = node.getFirstChild(); + if (child instanceof IntegerNode) { + int value = ((IntegerNode) child).getValue(); + return Pair.of(value, value); + } else { + IntegerRangeNode integerRange = (IntegerRangeNode) child; + if (integerRange.isEndBounded()) { + return Pair.of(integerRange.getStart(), integerRange.getEnd()); + } else { + return Pair.of(integerRange.getStart(), null); + } + } + } + + /** + * Subtract one from the given range endpoints and return it. + * + * @param range + * the range + * @return the updated range + */ + public static Pair subtractOneFrom(Pair range) { + int left = range.getLeft() > 0 ? (range.getLeft() - 1) : 0; + Integer right = range.getRight() == null ? null : (range.getRight() - 1); + return Pair.of(left, right); + } + + /** + * Return a new repetition node created from the given range. + * + * @param range + * the range + * @return the new repetition node + */ + public static RepetitionNode createRepetition(Pair range) { + if (Objects.equals(range.getLeft(), range.getRight())) { + return new RepetitionNode(new IntegerNode(range.getLeft())); + } else { + return new RepetitionNode(new IntegerRangeNode(range.getLeft(), range.getRight())); + } + } + + /** + * Return whether the given repetition quantifier is not a defined range, e.g. {x} rather than {x,y} or {x,}. + * + * @param node + * the node + * @return true if the repetition is not a range, or false otherwise + */ + public static boolean isNotRange(RepetitionNode node) { + return node.getFirstChild() instanceof IntegerNode; + } + + /** + * Return a copy of the given repetition as a range starting from zero. 
+ * + * @param node + * the node + * @return the new repetition quantifier + */ + public static RepetitionNode createRangeStartingFromZero(RepetitionNode node) { + IntegerRangeNode range = new IntegerRangeNode(); + range.setStart(0); + Node child = node.getFirstChild(); + if (child instanceof IntegerNode) { + range.setEnd(((IntegerNode) child).getValue()); + } else { + range.setEnd(((IntegerRangeNode) child).getEnd()); + } + return new RepetitionNode(range); + } + + private RegexUtils() { + throw new UnsupportedOperationException(); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RepetitionNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RepetitionNode.java new file mode 100644 index 00000000000..b31f006b0ce --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RepetitionNode.java @@ -0,0 +1,36 @@ +package datawave.data.normalizer.regex; + +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents a repetition requirement in a regex pattern, e.g. {@code {3}}. 
+ */ +public class RepetitionNode extends Node { + + public RepetitionNode() {} + + public RepetitionNode(Node child) { + super(child); + } + + public RepetitionNode(Map properties) { + super(properties); + } + + @Override + public NodeType getType() { + return NodeType.REPETITION; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitRepetition(this, data); + } + + @Override + public RepetitionNode shallowCopy() { + return new RepetitionNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/SingleCharNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/SingleCharNode.java new file mode 100644 index 00000000000..50e42a9621d --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/SingleCharNode.java @@ -0,0 +1,46 @@ +package datawave.data.normalizer.regex; + +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents a single, non-special character in a regex pattern. 
+ */ +public class SingleCharNode extends Node { + + public static final String PROPERTY_CHAR = "char"; + + public SingleCharNode(char character) { + setCharacter(character); + } + + public char getCharacter() { + return getProperty(PROPERTY_CHAR).charAt(0); + } + + public void setCharacter(char character) { + setProperty(PROPERTY_CHAR, String.valueOf(character)); + } + + public SingleCharNode() {} + + public SingleCharNode(Map properties) { + super(properties); + } + + @Override + public NodeType getType() { + return NodeType.SINGLE_CHAR; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitSingleChar(this, data); + } + + @Override + public SingleCharNode shallowCopy() { + return new SingleCharNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/StartAnchorNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/StartAnchorNode.java new file mode 100644 index 00000000000..77d3831931d --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/StartAnchorNode.java @@ -0,0 +1,34 @@ +package datawave.data.normalizer.regex; + +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents a regex start anchor, i.e. {@code ^}. 
+ */ +public class StartAnchorNode extends Node { + + protected StartAnchorNode() { + super(); + } + + public StartAnchorNode(Map properties) { + super(properties); + } + + @Override + public NodeType getType() { + return NodeType.START_ANCHOR; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitStartAnchor(this, data); + } + + @Override + public Node shallowCopy() { + return new StartAnchorNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/ZeroOrMoreNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/ZeroOrMoreNode.java new file mode 100644 index 00000000000..547ab550fad --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/ZeroOrMoreNode.java @@ -0,0 +1,32 @@ +package datawave.data.normalizer.regex; + +import java.util.Map; + +import datawave.data.normalizer.regex.visitor.Visitor; + +/** + * Represents the star in a regex pattern. 
+ */ +public class ZeroOrMoreNode extends Node { + + public ZeroOrMoreNode() {} + + public ZeroOrMoreNode(Map properties) { + super(properties); + } + + @Override + public NodeType getType() { + return NodeType.ZERO_OR_MORE; + } + + @Override + public Object accept(Visitor visitor, Object data) { + return visitor.visitZeroToMany(this, data); + } + + @Override + public ZeroOrMoreNode shallowCopy() { + return new ZeroOrMoreNode(this.properties); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/AlternationDeduper.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/AlternationDeduper.java new file mode 100644 index 00000000000..8585bee40d6 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/AlternationDeduper.java @@ -0,0 +1,57 @@ +package datawave.data.normalizer.regex.visitor; + +import java.util.LinkedHashMap; + +import datawave.data.normalizer.regex.AlternationNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.Node; + +public class AlternationDeduper extends CopyVisitor { + + public static Node dedupe(Node node) { + if (node == null) { + return null; + } + AlternationDeduper visitor = new AlternationDeduper(); + return (Node) node.accept(visitor, null); + } + + @Override + public Object visitExpression(ExpressionNode node, Object data) { + // If the node holds an alternation, dedupe the alternation's children. + if (node.getFirstChild() instanceof AlternationNode) { + Node visited = (Node) node.getFirstChild().accept(this, data); + // If an alternation was returned, multiple patterns were retained. Wrap it in an expression node before returning. + if (visited instanceof AlternationNode) { + return new ExpressionNode(visited); + } else { + // Otherwise we only have a single pattern remaining. Return the node as is. 
+ return visited; + } + } else { + // Otherwise this tree does not hold any alternations. Return a copy. + return copy(node); + } + } + + @Override + public Object visitAlternation(AlternationNode node, Object data) { + // Use LinkedHashMap to preserve insertion order. + LinkedHashMap uniquePatterns = new LinkedHashMap<>(); + // Check each child for uniqueness. + for (Node child : node.getChildren()) { + String childPattern = StringVisitor.toString(child); + // If the child has a pattern we have not seen before, retain a copy of it. + if (!uniquePatterns.containsKey(childPattern)) { + uniquePatterns.put(childPattern, copy(child)); + } + } + + // If only one + if (uniquePatterns.size() == 1) { + return uniquePatterns.values().iterator().next(); + } else { + return new AlternationNode(uniquePatterns.values()); + } + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/AnchorTrimmer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/AnchorTrimmer.java new file mode 100644 index 00000000000..2925e43c380 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/AnchorTrimmer.java @@ -0,0 +1,29 @@ +package datawave.data.normalizer.regex.visitor; + +import datawave.data.normalizer.regex.EndAnchorNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.StartAnchorNode; + +/** + * Implementation of {@link CopyVisitor} that returns a copy of a regex tree trimmed of all start and end anchors to simplify the normalization process. 
+ */ +public class AnchorTrimmer extends CopyVisitor { + + public static Node trim(Node node) { + if (node == null) { + return null; + } + AnchorTrimmer visitor = new AnchorTrimmer(); + return (Node) node.accept(visitor, null); + } + + @Override + public Object visitStartAnchor(StartAnchorNode node, Object data) { + return null; + } + + @Override + public Object visitEndAnchor(EndAnchorNode node, Object data) { + return null; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/BaseVisitor.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/BaseVisitor.java new file mode 100644 index 00000000000..a31aea2c57e --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/BaseVisitor.java @@ -0,0 +1,148 @@ +package datawave.data.normalizer.regex.visitor; + +import datawave.data.normalizer.regex.AlternationNode; +import datawave.data.normalizer.regex.AnyCharNode; +import datawave.data.normalizer.regex.CharClassNode; +import datawave.data.normalizer.regex.CharRangeNode; +import datawave.data.normalizer.regex.DigitCharClassNode; +import datawave.data.normalizer.regex.EmptyNode; +import datawave.data.normalizer.regex.EncodedNumberNode; +import datawave.data.normalizer.regex.EncodedPatternNode; +import datawave.data.normalizer.regex.EndAnchorNode; +import datawave.data.normalizer.regex.EscapedSingleCharNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.GroupNode; +import datawave.data.normalizer.regex.IntegerNode; +import datawave.data.normalizer.regex.IntegerRangeNode; +import datawave.data.normalizer.regex.OneOrMoreNode; +import datawave.data.normalizer.regex.QuestionMarkNode; +import datawave.data.normalizer.regex.RepetitionNode; +import datawave.data.normalizer.regex.SingleCharNode; +import datawave.data.normalizer.regex.StartAnchorNode; +import datawave.data.normalizer.regex.ZeroOrMoreNode; + +/** + * A basic {@link 
Visitor} implementation that will pass itself to the children of any node that accepts it. + */ +public class BaseVisitor implements Visitor { + + @Override + public Object visitExpression(ExpressionNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitAlternation(AlternationNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitGroup(GroupNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitDigitChar(DigitCharClassNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitCharClass(CharClassNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitCharRange(CharRangeNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitSingleChar(SingleCharNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitEscapedSingleChar(EscapedSingleCharNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitRepetition(RepetitionNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitQuestionMark(QuestionMarkNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitAnyChar(AnyCharNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitZeroToMany(ZeroOrMoreNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitOneToMany(OneOrMoreNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitInteger(IntegerNode node, Object data) { + 
node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitIntegerRange(IntegerRangeNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitEmpty(EmptyNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitStartAnchor(StartAnchorNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitEndAnchor(EndAnchorNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitEncodedNumber(EncodedNumberNode node, Object data) { + node.childrenAccept(this, data); + return data; + } + + @Override + public Object visitEncodedPattern(EncodedPatternNode node, Object data) { + node.childrenAccept(this, data); + return data; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/BinFinder.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/BinFinder.java new file mode 100644 index 00000000000..7f6fe611842 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/BinFinder.java @@ -0,0 +1,210 @@ +package datawave.data.normalizer.regex.visitor; + +import org.apache.commons.lang3.tuple.Pair; + +import datawave.data.normalizer.regex.IntegerNode; +import datawave.data.normalizer.regex.IntegerRangeNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.NodeListIterator; +import datawave.data.normalizer.regex.RegexConstants; +import datawave.data.normalizer.regex.RegexUtils; + +/** + * Abstract class for {@link LTOneBinFinder} and {@link GTEOneBinFinder} with common properties and functionality. + */ +abstract class BinFinder { + + // The original node. + protected final Node node; + + // An iterator for the node's children. 
+ protected final NodeListIterator childrenIter; + + // The index of the decimal point in the node's children, possibly -1. + protected final int decimalPointIndex; + + // The smallest bin value. + protected final int minBin; + + // The highest bin value. + protected final int maxBin; + + // The initial value for the lower and upper endpoints. + protected final int initialEndpointValue; + + // The current lower end of the bin range. + protected int lower; + + // The current upper end of the bin range. + protected int upper; + + protected boolean lowerLocked; + + protected BinFinder(Node node, int minBin, int maxBin, int initialEndpointValue) { + this.node = node; + this.decimalPointIndex = RegexUtils.getDecimalPointIndex(node); + this.minBin = minBin; + this.maxBin = maxBin; + this.initialEndpointValue = initialEndpointValue; + this.childrenIter = node.getChildrenIterator(); + + // Set the initial end point values. + this.lower = initialEndpointValue; + this.upper = initialEndpointValue; + + // If the first child is a hyphen, skip over it and start at the next child. + if (RegexUtils.isChar(node.getFirstChild(), RegexConstants.HYPHEN)) { + childrenIter.next(); + } + } + + protected abstract Pair getBinRange(); + + /** + * Increment lower by one. + */ + protected void incrementLower() { + if (!lowerLocked) { + lower++; + } + } + + /** + * Increment lower by the given value. + * + * @param value + * the value + */ + protected void incrementLower(int value) { + if (!lowerLocked) { + lower += value; + } + } + + /** + * Lock modifications to the lower bound. Any subsequent calls to {@link #incrementLower()} or {@link #incrementLower(int)} will not modify the lower bound. + */ + protected void lockLower() { + this.lowerLocked = true; + } + + /** + * Unlock modifications to the lower bound. Any subsequent calls to {@link #incrementLower()} or {@link #incrementLower(int)} will modify the lower bound. 
+ */ + protected void unlockLower() { + this.lowerLocked = false; + } + + /** + * Set lower to the initial endpoint value. + */ + protected void setLowerToInitialEndpointValue() { + this.lower = initialEndpointValue; + } + + /** + * Increment upper by one. + */ + protected void incrementUpper() { + upper++; + } + + /** + * Increment upper by the given value. + * + * @param value + * the value + */ + protected void incrementUpper(int value) { + upper += value; + } + + /** + * Set upper to the max bin value. + */ + protected void setUpperToMax() { + upper = maxBin; + } + + /** + * Normalize the endpoints to be within the min and max bin if they were updated. + */ + protected void normalizeRange() { + // Do not normalize if both the upper and lower are the initial endpoint value. This indicates that a valid bin range was not found. + if (lower != initialEndpointValue || upper != initialEndpointValue) { + // Normalize the bin range to be within a valid bin range. If the lower bound is less than the min bin, set it to the min bin. If it is greater than + // the max bin, set it to the max bin. + if (lower < minBin) { + lower = minBin; + } else if (lower > maxBin) { + lower = maxBin; + } + + // If the upper bound is greater than the max bin, set it to the max bin. + if (upper > maxBin) { + upper = maxBin; + } + } + } + + /** + * Return a {@link Pair} with the lower and upper bin range endpoints, or null if no valid bin range was found. + * + * @return the bin range + */ + protected Pair getEndpoints() { + if (lower != initialEndpointValue || upper != initialEndpointValue) { + return Pair.of(lower, upper); + } else { + return null; + } + } + + /** + * Update lower and upper based on the quantities read from the next quantifier in the iterator. + */ + protected void updateRangeWithNextQuantifier() { + // Update the range. + updateRangeWithQuantifier(childrenIter.next()); + // If the node after the quantifier node is an question mark, skip over it. 
+ childrenIter.seekPastQuestionMarks(); + } + + /** + * Update lower and upper based off the quantities read from the next quantifier. + */ + protected void updateRangeWithQuantifier(Node quantifier) { + switch (quantifier.getType()) { + case REPETITION: + // In the case of a repetition node, we may have an IntegerNode or IntegerRangeNode child. + Node child = quantifier.getFirstChild(); + if (child instanceof IntegerNode) { + // Increment both the upper and lower bound by the repetition value. + int value = ((IntegerNode) child).getValue(); + incrementLower(value); + incrementUpper(value); + } else { + IntegerRangeNode rangeNode = (IntegerRangeNode) child; + // Increment the lower bound by the range start value. + incrementLower(rangeNode.getStart()); + // If the end of the range has a bound, increment the upper bound by the end bound. Otherwise, set the upper bound to the max. + if (rangeNode.isEndBounded()) { + incrementUpper(rangeNode.getEnd()); + } else { + setUpperToMax(); + } + } + break; + case ZERO_OR_MORE: + // Set the upper to the max. Do not modify the lower bound. + setUpperToMax(); + break; + case ONE_OR_MORE: + // Set the upper bound to the max. + setUpperToMax(); + // Increment the lower bound by one. 
+ incrementLower(); + break; + } + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/CopyVisitor.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/CopyVisitor.java new file mode 100644 index 00000000000..ce7620c1ec9 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/CopyVisitor.java @@ -0,0 +1,161 @@ +package datawave.data.normalizer.regex.visitor; + +import java.util.Objects; + +import datawave.data.normalizer.regex.AlternationNode; +import datawave.data.normalizer.regex.AnyCharNode; +import datawave.data.normalizer.regex.CharClassNode; +import datawave.data.normalizer.regex.CharRangeNode; +import datawave.data.normalizer.regex.DigitCharClassNode; +import datawave.data.normalizer.regex.EmptyNode; +import datawave.data.normalizer.regex.EncodedNumberNode; +import datawave.data.normalizer.regex.EncodedPatternNode; +import datawave.data.normalizer.regex.EndAnchorNode; +import datawave.data.normalizer.regex.EscapedSingleCharNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.GroupNode; +import datawave.data.normalizer.regex.IntegerNode; +import datawave.data.normalizer.regex.IntegerRangeNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.OneOrMoreNode; +import datawave.data.normalizer.regex.QuestionMarkNode; +import datawave.data.normalizer.regex.RepetitionNode; +import datawave.data.normalizer.regex.SingleCharNode; +import datawave.data.normalizer.regex.StartAnchorNode; +import datawave.data.normalizer.regex.ZeroOrMoreNode; + +/** + * A {@link Visitor} implementation that returns a copy of a given {@link Node} tree. + */ +public class CopyVisitor implements Visitor { + + /** + * Return a copy of the given node tree, or null if the node is null. Any null children will be filtered out. 
+ * + * @param node + * the tree to copy + * @return the copy + */ + public static Node copy(Node node) { + if (node == null) { + return null; + } + CopyVisitor visitor = new CopyVisitor(); + return (Node) node.accept(visitor, null); + } + + /** + * Return a copy of the given node. + * + * @param node + * the node to copy + * @param data + * the data + * @return the copy + */ + protected Node copy(Node node, Object data) { + Node copy = node.shallowCopy(); + node.getChildren().stream().map((child) -> (Node) child.accept(this, data)).filter(Objects::nonNull).forEach(copy::addChild); + return copy; + } + + @Override + public Object visitExpression(ExpressionNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitAlternation(AlternationNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitGroup(GroupNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitDigitChar(DigitCharClassNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitCharClass(CharClassNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitCharRange(CharRangeNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitSingleChar(SingleCharNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitEscapedSingleChar(EscapedSingleCharNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitRepetition(RepetitionNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitQuestionMark(QuestionMarkNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitAnyChar(AnyCharNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitZeroToMany(ZeroOrMoreNode node, Object data) { + return copy(node, data); + } + + @Override + public Object 
visitOneToMany(OneOrMoreNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitInteger(IntegerNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitIntegerRange(IntegerRangeNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitEmpty(EmptyNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitStartAnchor(StartAnchorNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitEndAnchor(EndAnchorNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitEncodedNumber(EncodedNumberNode node, Object data) { + return copy(node, data); + } + + @Override + public Object visitEncodedPattern(EncodedPatternNode node, Object data) { + return copy(node, data); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/DecimalPointPlacer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/DecimalPointPlacer.java new file mode 100644 index 00000000000..019d91a9e14 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/DecimalPointPlacer.java @@ -0,0 +1,561 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.RegexUtils.createRepetition; +import static datawave.data.normalizer.regex.RegexUtils.getRepetitionAsRange; +import static datawave.data.normalizer.regex.RegexUtils.subtractOneFrom; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import datawave.data.normalizer.regex.AlternationNode; +import datawave.data.normalizer.regex.EncodedPatternNode; +import datawave.data.normalizer.regex.EscapedSingleCharNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.GroupNode; +import datawave.data.normalizer.regex.Node; +import 
datawave.data.normalizer.regex.NodeListIterator; +import datawave.data.normalizer.regex.NodeType; +import datawave.data.normalizer.regex.OneOrMoreNode; +import datawave.data.normalizer.regex.QuestionMarkNode; +import datawave.data.normalizer.regex.RegexConstants; +import datawave.data.normalizer.regex.RegexUtils; +import datawave.data.normalizer.regex.RepetitionNode; +import datawave.data.normalizer.regex.ZeroOrMoreNode; + +/** + * Implementation of {@link CopyVisitor} that return a copy of a regex tree with decimal places inserted where required in encoded regex patterns. Patterns + * starting with an element that has a quantifier {@code (* + or {x})} will see the quantifier modified as required to ensure a decimal place is inserted + * correctly. Multiple optional decimal points may be added to a single regex pattern. + */ +public class DecimalPointPlacer extends CopyVisitor { + + public static Node addDecimalPoints(Node node) { + if (node == null) { + return null; + } + DecimalPointPlacer visitor = new DecimalPointPlacer(); + return (Node) node.accept(visitor, null); + } + + @Override + public Object visitEncodedPattern(EncodedPatternNode node, Object data) { + // Operate on a copy of the node. + Node copy = copy(node); + + // Create an initial encoded pattern node with all the leading bin info. + EncodedPatternNode encodedPattern = new EncodedPatternNode(); + NodeListIterator iter = copy.getChildrenIterator(); + while (iter.hasNext()) { + Node next = iter.next(); + encodedPattern.addChild(next); + if (RegexUtils.isChar(next, RegexConstants.CAPITAL_E)) { + break; + } + } + + // Determine what character is equivalent to the zero character. For patterns matching positive numbers, this is '0'. For patterns matching negative + // numbers, this is '9'. + boolean positiveNumber = RegexUtils.isChar(node.getFirstChild(), RegexConstants.PLUS); + char zeroChar = positiveNumber ? 
RegexConstants.ZERO : RegexConstants.NINE; + + // Get a list of nodes with decimal points added and add them to the pattern node. + DecimalPointAdder adder = new DecimalPointAdder(iter, zeroChar); + List nodes = adder.addDecimalPoints(); + encodedPattern.addChildren(nodes); + + // Add the remaining children to the pattern node. + while (iter.hasNext()) { + encodedPattern.addChild(iter.next()); + } + return encodedPattern; + } + + private static class DecimalPointAdder { + + // The node iterator. + private final NodeListIterator iter; + + // The character that is the equivalent to zero. For patterns matching positive numbers: '0'. For patterns matching negative numbers: '9'. + private final char zeroChar; + + // The nodes enriched with decimal points. + private final List nodes = new ArrayList<>(); + + // The most recent element. + private Node currentElement; + + // The most recent quantifier. + private Node currentQuantifier; + + // The most recent question mark. + private Node currentQuestionMark; + + // Whether any decimal points have been added. + boolean addedAnyDecimalPoints; + + // Whether additional optional decimal points should be added. + boolean addMoreDecimalPoints = true; + + // Whether a non-leading zero has been seen. + boolean nonLeadingZeroSeen = false; + + public DecimalPointAdder(NodeListIterator iter, char zeroChar) { + this(iter, zeroChar, false); + } + + private DecimalPointAdder(NodeListIterator iter, char zeroChar, boolean addedAnyDecimalPoints) { + this.iter = iter; + this.zeroChar = zeroChar; + this.addedAnyDecimalPoints = addedAnyDecimalPoints; + } + + /** + * Return a list of nodes enriched with decimal points. This list is not guaranteed to contain all nodes found within the iterator supplied to + * {@link #DecimalPointAdder(NodeListIterator, char)}, so subsequent calls to {@link NodeListIterator#next()} should be made to the iterator after the + * fact to retrieve any remaining nodes. 
+ */ + public List addDecimalPoints() { + // If we can skip adding decimal points, do so. + if (skipAddingDecimalPoints()) { + return nodes; + } + + // Add decimal points until either there are no more elements or if we have created a final decimal point. + while (iter.hasNext() && addMoreDecimalPoints) { + // Capture the current element, quantifier, and optional. + captureNext(); + + switch (currentElement.getType()) { + case GROUP: + addGroup(); + break; + case ANY_CHAR: + case CHAR_CLASS: + case DIGIT_CHAR_CLASS: + case SINGLE_CHAR: + // If we have seen a non-leading zero, mark it so. + if (!matchesZero(currentElement)) { + nonLeadingZeroSeen = true; + } + // Quantified characters must be handled differently from non-quantified characters. + if (currentQuantifier == null) { + addNonQuantifiedElement(); + } else { + addQuantifiedElement(); + } + break; + default: + throw new IllegalArgumentException("Unhandled element type: " + currentElement.getType()); + } + + // Mark whether we've added any decimal points only after processing the first decimal point. + addedAnyDecimalPoints = true; + } + + return nodes; + } + + /** + * Return whether the entire pattern after the bin information consists of .*, .+, or a non-quantified element. + * + * @return true if decimal points do not need to be added to this pattern, or false otherwise + */ + private boolean skipAddingDecimalPoints() { + int originalIndex = iter.index(); + try { + Node element = iter.next(); + Node quantifier = iter.isNextQuantifier() ? iter.next() : null; + iter.seekPastQuestionMarks(); + + // If there is a second element, we cannot skip adding decimal points. + if (iter.hasNext()) { + return false; + } else { + // If the sole element is a wildcard, we do not need to add decimal points if it is '.' '.*' or '.+'. 
+ if (element.getType() == NodeType.ANY_CHAR) { + return quantifier == null || quantifier instanceof ZeroOrMoreNode || quantifier instanceof OneOrMoreNode; + } else if (element.getType() == NodeType.GROUP) { + // If the sole element is a group, we likely need to add decimal points. + return false; + } else { + // If sole element is not a wildcard, but has no quantifier, we do not need to add decimal points. + return quantifier == null; + } + } + + } finally { + iter.setIndex(originalIndex); + } + } + + /** + * Capture the next element, quantifier, and optional. + */ + private void captureNext() { + currentElement = iter.next(); + currentQuantifier = iter.isNextQuantifier() ? iter.next() : null; + currentQuestionMark = iter.isNextQuestionMark() ? iter.next() : null; + } + + /** + * The current element is either an optional group of leading zeros with a defined range that must occur more than once, or a group of ending + * alternations. + */ + private void addGroup() { + if (currentElement.getFirstChild().getType() == NodeType.ALTERNATION) { + addEndingAlternationsGroup(); + } else { + addLeadingZeroGroup(); + } + } + + private void addLeadingZeroGroup() { + Node innerElement = currentElement.getFirstChild(); + Node innerQuantifier = currentElement.getChildAt(1); + Node innerQuestionMark = currentElement.getChildCount() == 3 ? currentElement.getChildAt(2) : null; + + // If the inner element can only match zero, we do not need to insert any decimal points. Add them as is. + if (matchesZeroOnly(innerElement)) { + addAllCurrentToNodes(); + } else { + // Get the group's children with a decimal point inserted where appropriate. Require the decimal point to be optional. 
+ List nodes = getRepetitionQuantifiedElements(innerElement, innerQuantifier, innerQuestionMark, true); + GroupNode groupNode = new GroupNode(); + groupNode.addChildren(nodes); + this.nodes.add(groupNode); + this.nodes.add(new QuestionMarkNode()); + } + } + + private void addEndingAlternationsGroup() { + // The current element is a group with an alternation child that has expressions that we may need to add decimal points to. + AlternationNode alternation = new AlternationNode(); + for (Node expression : currentElement.getFirstChild().getChildren()) { + NodeListIterator expressionIter = expression.getChildrenIterator(); + DecimalPointAdder adder = new DecimalPointAdder(expressionIter, zeroChar, addedAnyDecimalPoints); + List nodes = adder.addDecimalPoints(); + while (expressionIter.hasNext()) { + nodes.add(expressionIter.next()); + } + ExpressionNode newExpression = new ExpressionNode(nodes); + alternation.addChild(newExpression); + } + this.nodes.add(new GroupNode(alternation)); + } + + /** + * Add a decimal point based on a current element that is not quantified. + */ + private void addNonQuantifiedElement() { + // Add the current nodes. + addCurrentElementToNodes(); + addCurrentQuestionMarkToNodes(); + + // If this is the last element in the regex expression, do not add any decimal points. + if (!iter.hasNext()) { + return; + } + + // Add a decimal point. + addDecimalPointToNodes(); + + if (currentQuestionMark != null) { + // If the current element is optional, make the decimal point optional. + addQuestionMarkToNodes(); + } else { + // Otherwise if we have added any optional decimal points before this one, or the remaining pattern can be zero-length, make the decimal point + // optional. + if (addedAnyDecimalPoints || remainingPatternCanBeZeroLength()) { + addQuestionMarkToNodes(); + } + // Stop adding more decimal points. + addMoreDecimalPoints = false; + } + } + + /** + * Add decimal points based on a current element that is quantified. 
+ */ + private void addQuantifiedElement() { + switch (currentQuantifier.getType()) { + case ZERO_OR_MORE: + // Add decimal point for quantifier *. + addZeroOrMoreQuantifiedElement(); + break; + case ONE_OR_MORE: + // Add decimal point for quantifier +. + addOneOrMoreQuantifiedElement(); + break; + case REPETITION: + // Add decimal point for quantifier {x}. + this.nodes.addAll(getRepetitionQuantifiedElements(currentElement, currentQuantifier, currentQuestionMark, false)); + break; + } + } + + /** + * Add a decimal point for a current element that is followed by *. + */ + private void addZeroOrMoreQuantifiedElement() { + // If the current element is a wildcard, we're looking at .* and can add it as is. + if (currentElement.getType() == NodeType.ANY_CHAR) { + addAllCurrentToNodes(); + } else { + // Add an optional variant of the current element. + addCurrentElementToNodes(); + addQuestionMarkToNodes(); + // Add an optional decimal point. + addDecimalPointToNodes(); + addQuestionMarkToNodes(); + // Add the current element again, followed by the current quantifier and optional. + addAllCurrentToNodes(); + } + } + + /** + * Add a decimal point for a current element that is followed by +. + */ + private void addOneOrMoreQuantifiedElement() { + // Add the current element, non-optional. + addCurrentElementToNodes(); + // Add an optional decimal point. + addDecimalPointToNodes(); + addQuestionMarkToNodes(); + // Add the current element again, but this time followed by a *, as well as the current optional. + addCurrentElementToNodes(); + nodes.add(new ZeroOrMoreNode()); + addCurrentQuestionMarkToNodes(); + // Do not add any more decimal points after this. + addMoreDecimalPoints = false; + } + + /** + * Add decimal points for a current element that is followed by a repetition. 
+ */ + private List getRepetitionQuantifiedElements(Node element, Node quantifier, Node questionMark, boolean makeDecimalOptional) { + List nodes = new ArrayList<>(); + + // Add an initial copy of the current element. + nodes.add(copy(element)); + + // Get the repetition range from the quantifier node. + Pair repetitionRange = getRepetitionAsRange((RepetitionNode) quantifier); + boolean elementMarkedOptional = false; + if (repetitionRange.getLeft() == 0) { + // If the repetition range starts with 0, either {0,} or {0,x}, make the first occurrence of the element optional. + nodes.add(new QuestionMarkNode()); + elementMarkedOptional = true; + } + + // Subtract one from both endpoints of the repetition since we have added an initial single copy of the element to the nodes already. What we do + // next will depend on what the updated repetition range now covers. + repetitionRange = subtractOneFrom(repetitionRange); + + // The new repetition range is {0,}, which is equivalent to *. + if (repetitionRange.getLeft() == 0 && repetitionRange.getRight() == null) { + nodes.add(createDecimalPoint()); + nodes.add(new QuestionMarkNode()); + nodes.add(copy(element)); + nodes.add(new ZeroOrMoreNode()); + if (questionMark != null) { + nodes.add(copy(questionMark)); + } + } else if (repetitionRange.getLeft() == 1 && repetitionRange.getRight() == null) { + // The new repetition range is {1,}, which is equivalent to +. + nodes.add(createDecimalPoint()); + if (makeDecimalOptional) { + nodes.add(new QuestionMarkNode()); + } + nodes.add(copy(element)); + nodes.add(new OneOrMoreNode()); + if (questionMark != null) { + nodes.add(copy(questionMark)); + } + } else if (repetitionRange.getRight() == null) { + // The new repetition range is {x,}. 
+ nodes.add(createDecimalPoint()); + if (makeDecimalOptional) { + nodes.add(new QuestionMarkNode()); + } + nodes.add(copy(element)); + nodes.add(createRepetition(repetitionRange)); + if (questionMark != null) { + nodes.add(copy(questionMark)); + } + } else if (repetitionRange.getLeft() == 0 && repetitionRange.getRight() > 0) { + // The new repetition range is {0,x}. + nodes.add(createDecimalPoint()); + // If either we're looking at an optional group, or we have added any decimal points before, or we have not seen a non-leading zero, + // or there is only one more element, or the remaining pattern can be zero-length, make the decimal point optional. + if (iter.hasNext()) { + if (makeDecimalOptional || addedAnyDecimalPoints || !nonLeadingZeroSeen || remainingPatternCanBeZeroLength() + || remainingPatternHasOnlyOneMoreElement()) { + nodes.add(new QuestionMarkNode()); + } + } else { + nodes.add(new QuestionMarkNode()); + } + nodes.add(copy(element)); + nodes.add(createRepetition(repetitionRange)); + if (questionMark != null) { + nodes.add(copy(questionMark)); + } + } else if (repetitionRange.getLeft() == 1 && repetitionRange.getRight() == 1) { + // The new repetition range is {1,1}. Another instance of the element can be added without a repetition after it. + nodes.add(createDecimalPoint()); + if (makeDecimalOptional) { + nodes.add(new QuestionMarkNode()); + } + nodes.add(copy(element)); + } else if (repetitionRange.getLeft() > 0 || repetitionRange.getRight() > 0) { + // The new repetition range is {x,y}. Add an instance of the element with the repetition after it. + nodes.add(createDecimalPoint()); + if (makeDecimalOptional) { + nodes.add(new QuestionMarkNode()); + } + nodes.add(copy(element)); + nodes.add(createRepetition(repetitionRange)); + if (questionMark != null) { + nodes.add(copy(questionMark)); + } + } else if (repetitionRange.getLeft() == 0 && repetitionRange.getRight() == 0) { + // The new repetition range is {0,0}. 
Do not add another instance of the element. If the remaining pattern can be zero-length, or the first + // instance of the element was marked optional, make the decimal point optional. + if (iter.hasNext()) { + nodes.add(createDecimalPoint()); + if (makeDecimalOptional || remainingPatternCanBeZeroLength() || elementMarkedOptional) { + nodes.add(new QuestionMarkNode()); + } + } + } + if (nonLeadingZeroSeen) { + addMoreDecimalPoints = false; + } + return nodes; + } + + /** + * Add a copy of {@link #currentElement} to the node list. + */ + private void addCurrentElementToNodes() { + nodes.add(copy(currentElement)); + } + + /** + * Add a copy of {@link #currentQuantifier} to the node list if it is not null. + */ + private void addCurrentQuantifierToNodes() { + if (currentQuantifier != null) { + nodes.add(copy(currentQuantifier)); + } + } + + /** + * Add a copy of {@link #currentQuestionMark} to the node list if it is not null. + */ + private void addCurrentQuestionMarkToNodes() { + if (currentQuestionMark != null) { + nodes.add(copy(currentQuestionMark)); + } + } + + /** + * Add the current element, quantifier, and question mark to the node list. + */ + private void addAllCurrentToNodes() { + addCurrentElementToNodes(); + addCurrentQuantifierToNodes(); + addCurrentQuestionMarkToNodes(); + } + + /** + * Add a new {@code "\."} to the node list. + */ + private void addDecimalPointToNodes() { + nodes.add(createDecimalPoint()); + } + + /** + * Return a new escaped decimal point as a node. + */ + private Node createDecimalPoint() { + return new EscapedSingleCharNode(RegexConstants.PERIOD); + } + + /** + * Add a new {@code "?"} to the node list. + */ + private void addQuestionMarkToNodes() { + nodes.add(new QuestionMarkNode()); + } + + /** + * Return whether all remaining elements in the iterator can either occur zero times or match a zero. + * + * @return true if the remaining pattern can be zero-length, or false otherwise. 
+ */ + private boolean remainingPatternCanBeZeroLength() { + // Mark the original index so that we can reset the iterator before exiting this method. + int originalIndex = iter.index(); + + // Seek past all zero-matching elements. + iter.seekPastZeroMatchingElements(); + + boolean canBeZeroLength = true; + while (iter.hasNext()) { + Node next = iter.next(); + // If the next element can match zero, it could be a trailing zero that would get trimmed from encoded numbers. + if (matchesZero(next)) { + iter.seekPastQuantifiers(); + iter.seekPastQuestionMarks(); + } else { + // If the next element cannot match zero, it could still occur zero times based on its quantifier (if present). + if (iter.hasNext() && iter.isNextQuantifier()) { + Node quantifier = iter.next(); + if (quantifier instanceof OneOrMoreNode) { + // If the element is followed by +, it must occur at least once. Remaining pattern cannot be zero-length. + canBeZeroLength = false; + break; + } else if (quantifier instanceof RepetitionNode) { + // If the remaining element is not followed by repetition variation of {0} or {0,x}, it cannot occur zero times. Remaining pattern + // cannot be zero-length. + if (!RegexUtils.repetitionCanOccurZeroTimes((RepetitionNode) quantifier)) { + canBeZeroLength = false; + break; + } + } + } else { + // If there is no quantifier, then the current element must occur. Remaining pattern cannot be zero-length. + canBeZeroLength = false; + break; + } + } + } + iter.setIndex(originalIndex); + return canBeZeroLength; + } + + /** + * Return whether only one more element (possibly quantified and/or optional) remains in the iterator. 
+ */ + private boolean remainingPatternHasOnlyOneMoreElement() { + int originalIndex = iter.index(); + iter.next(); + iter.seekPastQuantifiers(); + iter.seekPastQuestionMarks(); + boolean hasOnlyOneMore = !iter.hasNext(); + iter.setIndex(originalIndex); + return hasOnlyOneMore; + } + + private boolean matchesZero(Node node) { + return RegexUtils.matchesChar(node, zeroChar); + } + + private boolean matchesZeroOnly(Node node) { + return RegexUtils.matchesCharOnly(node, zeroChar); + } + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/DecimalPointValidator.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/DecimalPointValidator.java new file mode 100644 index 00000000000..8254f34821a --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/DecimalPointValidator.java @@ -0,0 +1,58 @@ +package datawave.data.normalizer.regex.visitor; + +import datawave.data.normalizer.regex.AlternationNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.NodeListIterator; +import datawave.data.normalizer.regex.RegexUtils; + +/** + * Implementation of {@link BaseVisitor} that accepts a {@link Node} tree and verifies that each alternated expression does not contain more than one decimal + * point. + */ +public class DecimalPointValidator extends BaseVisitor { + + public static void validate(Node node) { + if (node != null) { + DecimalPointValidator visitor = new DecimalPointValidator(); + node.accept(visitor, null); + } + } + + @Override + public Object visitExpression(ExpressionNode node, Object data) { + if (node.getFirstChild() instanceof AlternationNode) { + return super.visitExpression(node, data); + } else { + checkForInvalidDecimalPoints(node); + } + return null; + } + + /** + * Check the given expressions for valid decimal point specifications. 
+ * + * @param node + * the node to validate + */ + private void checkForInvalidDecimalPoints(Node node) { + boolean decimalPointSeen = false; + NodeListIterator iter = node.getChildrenIterator(); + // Iterate through each element. + while (iter.hasNext()) { + // Get the next element. + Node next = iter.next(); + // If the current element is a decimal point, validate it. + if (RegexUtils.isDecimalPoint(next)) { + if (decimalPointSeen) { + throw new IllegalArgumentException("Regex may not contain expressions with than one decimal point."); + } else { + decimalPointSeen = true; + } + } + // Skip past any quantifiers or optionals if specified. + iter.seekPastQuantifiers(); + iter.seekPastQuestionMarks(); + } + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/EmptyLeafTrimmer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/EmptyLeafTrimmer.java new file mode 100644 index 00000000000..0ef9cecbed9 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/EmptyLeafTrimmer.java @@ -0,0 +1,79 @@ +package datawave.data.normalizer.regex.visitor; + +import datawave.data.normalizer.regex.AlternationNode; +import datawave.data.normalizer.regex.EmptyNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.GroupNode; +import datawave.data.normalizer.regex.Node; + +/** + * Implementation of {@link CopyVisitor} that will return a copy of the tree trimmed such that the following modifications are made to it: + *
<ul>
+ * <li>Remove all {@link EmptyNode} instances.</li>
+ * <li>Remove all {@link GroupNode} instances that subsequently have no children.</li>
+ * <li>Remove all {@link AlternationNode} instances that subsequently have one or no children. In the case of one child, the child will replace the
+ * {@link AlternationNode}.</li>
+ * <li>Remove all {@link ExpressionNode} instances that subsequently have an {@link ExpressionNode} child.</li>
+ * </ul>
+ * See the following examples:
+ * <ul>
+ * <li>Input {@code "3||4||5"} will return {@code "3|4|5"}</li>
+ * <li>Input {@code "3|()"} will return {@code "3"}</li>
+ * <li>Input {@code "()|()"} will return {@code null}</li>
+ * </ul>
+ */ +public class EmptyLeafTrimmer extends CopyVisitor { + + /** + * Return a copy of the given tree trimmed of empty nodes. If the entire tree is trimmed, null will be returned, otherwise a {@link ExpressionNode} with the + * trimmed tree will be returned. + * + * @param node + * the node to trim + * @return the trimmed node + */ + public static Node trim(Node node) { + if (node == null) { + return null; + } + EmptyLeafTrimmer visitor = new EmptyLeafTrimmer(); + return (Node) node.accept(visitor, null); + } + + @Override + public Object visitExpression(ExpressionNode node, Object data) { + Node copy = (Node) super.visitExpression(node, data); + if (copy.isLeaf()) { + return null; + } else if (copy.getChildCount() == 1) { + Node child = copy.getFirstChild(); + if (child instanceof ExpressionNode) { + return child; + } + } + return copy; + } + + @Override + public Object visitAlternation(AlternationNode node, Object data) { + Node copy = (Node) super.visitAlternation(node, data); + if (copy.isLeaf()) { + return null; + } else if (copy.getChildCount() == 1) { + return copy.getFirstChild(); + } else { + return copy; + } + } + + @Override + public Object visitGroup(GroupNode node, Object data) { + Node copy = (Node) super.visitGroup(node, data); + return copy.isLeaf() ? 
null : copy; + } + + @Override + public Object visitEmpty(EmptyNode node, Object data) { + return null; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/EqualityVisitor.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/EqualityVisitor.java new file mode 100644 index 00000000000..b3291346593 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/EqualityVisitor.java @@ -0,0 +1,169 @@ +package datawave.data.normalizer.regex.visitor; + +import datawave.data.normalizer.regex.AlternationNode; +import datawave.data.normalizer.regex.AnyCharNode; +import datawave.data.normalizer.regex.CharClassNode; +import datawave.data.normalizer.regex.CharRangeNode; +import datawave.data.normalizer.regex.DigitCharClassNode; +import datawave.data.normalizer.regex.EmptyNode; +import datawave.data.normalizer.regex.EncodedNumberNode; +import datawave.data.normalizer.regex.EncodedPatternNode; +import datawave.data.normalizer.regex.EndAnchorNode; +import datawave.data.normalizer.regex.EscapedSingleCharNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.GroupNode; +import datawave.data.normalizer.regex.IntegerNode; +import datawave.data.normalizer.regex.IntegerRangeNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.OneOrMoreNode; +import datawave.data.normalizer.regex.QuestionMarkNode; +import datawave.data.normalizer.regex.RepetitionNode; +import datawave.data.normalizer.regex.SingleCharNode; +import datawave.data.normalizer.regex.StartAnchorNode; +import datawave.data.normalizer.regex.ZeroOrMoreNode; + +/** + * A {@link Visitor} implementation that will compare two {@link Node} trees and determine if they are equal. + */ +public class EqualityVisitor implements Visitor { + + /** + * Return whether the given {@link Node} trees are equal. 
+ * + * @param left + * the left tree to compare + * @param right + * the right tree to compare + * @return true if the trees are equal, or false otherwise. + */ + public static boolean isEqual(Node left, Node right) { + if (left != null && right != null) { + EqualityVisitor visitor = new EqualityVisitor(); + return (boolean) left.accept(visitor, right); + } else { + return left == null && right == null; + } + } + + private boolean isEqual(Node left, Object data) { + Node right = (Node) data; + // Compare the nodes. + if (!left.equals(right)) { + return false; + } + // Compare the child counts. + if (left.getChildCount() != right.getChildCount()) { + return false; + } + // Compare the children. + for (int index = 0; index < left.getChildCount(); index++) { + Node leftChild = left.getChildAt(index); + Node rightChild = right.getChildAt(index); + boolean isEqual = (boolean) leftChild.accept(this, rightChild); + if (!isEqual) { + return false; + } + } + return true; + } + + @Override + public Object visitExpression(ExpressionNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitAlternation(AlternationNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitGroup(GroupNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitDigitChar(DigitCharClassNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitCharClass(CharClassNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitCharRange(CharRangeNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitSingleChar(SingleCharNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitEscapedSingleChar(EscapedSingleCharNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitRepetition(RepetitionNode node, Object data) { + 
return isEqual(node, data); + } + + @Override + public Object visitQuestionMark(QuestionMarkNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitAnyChar(AnyCharNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitZeroToMany(ZeroOrMoreNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitOneToMany(OneOrMoreNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitInteger(IntegerNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitIntegerRange(IntegerRangeNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitEmpty(EmptyNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitStartAnchor(StartAnchorNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitEndAnchor(EndAnchorNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitEncodedNumber(EncodedNumberNode node, Object data) { + return isEqual(node, data); + } + + @Override + public Object visitEncodedPattern(EncodedPatternNode node, Object data) { + return isEqual(node, data); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ExponentialBinAdder.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ExponentialBinAdder.java new file mode 100644 index 00000000000..0884988f23e --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ExponentialBinAdder.java @@ -0,0 +1,154 @@ +package datawave.data.normalizer.regex.visitor; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.Function; + +import org.apache.commons.lang3.tuple.Pair; + +import datawave.data.normalizer.regex.CharClassNode; +import 
datawave.data.normalizer.regex.CharRangeNode; +import datawave.data.normalizer.regex.EncodedPatternNode; +import datawave.data.normalizer.regex.EscapedSingleCharNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.RegexConstants; +import datawave.data.normalizer.regex.RegexUtils; +import datawave.data.normalizer.regex.SingleCharNode; +import datawave.data.type.util.NumericalEncoder; + +/** + * Implementation of {@link CopyVisitor} that will return a copy of the tree where all non-simple number patterns are enriched with bin information. + */ +public class ExponentialBinAdder extends SubExpressionVisitor { + + /** + * Return a copy of the given tree with all regex patterns enriched with exponential bin information. + * + * @param node + * the node + * @return the enriched node + */ + public static Node addBins(Node node) { + if (node == null) { + return null; + } + ExponentialBinAdder visitor = new ExponentialBinAdder(); + return (Node) node.accept(visitor, null); + } + + // Retrieves bins for negative numbers. + private static final Function NEGATIVE_BIN_FUNCTION = NumericalEncoder::getNegativeBin; + + // Retrieves bins for positive numbers. + private static final Function POSITIVE_BIN_FUNCTION = NumericalEncoder::getPositiveBin; + + @Override + protected Object visitSubExpression(Node node) { + List binNodes = new ArrayList<>(); + boolean negative = RegexUtils.isNegativeRegex(node); + + // The bin information consist of: + // 1. The lead sign that indicates whether the range covers positive (\+) or negative numbers (!). + binNodes.add(getLeadSign(negative)); + // 2. The range of exponential bin letters. This may either be a single bin letter, or a character class of multiple bin letters. + binNodes.add(getBinRange(node, negative)); + // 3. An 'E' to separate the bin information from the beginning of the numeric regex pattern. 
+ binNodes.add(new SingleCharNode(RegexConstants.CAPITAL_E)); + + // Return an EncodedPatternNode copy rather than an ExpressionNode. + EncodedPatternNode encodedPattern = new EncodedPatternNode(copy(node).getChildren()); + + // If we had a negative sign, remove it. We will have ! (negative) and \+ (positive) going forward. + if (negative) { + encodedPattern.removeFirstChild(); + } + + // Insert the bin information at the beginning of the pattern. + int insertIndex = 0; + for (Node binNode : binNodes) { + encodedPattern.addChild(binNode, insertIndex); + insertIndex++; + } + return encodedPattern; + } + + /** + * Return {@code "\+"} if negative is false, or {@code "!"} if negative is true. + * + * @param negative + * whether the regex pattern matches against negative numbers. + * @return the lead sign + */ + private Node getLeadSign(boolean negative) { + return negative ? new SingleCharNode(RegexConstants.EXCLAMATION_POINT) : new EscapedSingleCharNode(RegexConstants.PLUS); + } + + /** + * Get the range of exponential bins that the regex pattern should cover. + * + * @param node + * the regex pattern + * @param negative + * whether the pattern matches against negative numbers + * @return the bin range, either a single bin letter or a character class of bin ranges + */ + private Node getBinRange(Node node, boolean negative) { + // Determine what exponential bins should be included in the encoded expression. + // Get the bin range for numbers equal to or greater than one that the pattern can match against. + Pair gteOneBinRange = GTEOneBinFinder.binRangeOf(node); + // Get the bin range for numbers less than one that the pattern can match against. + Pair ltOneBinRange = LTOneBinFinder.binRangeOf(node); + + // The target bin retrieval function depends on whether the pattern matches against negative numbers. + Function binFunction = negative ? 
NEGATIVE_BIN_FUNCTION : POSITIVE_BIN_FUNCTION; + + if (gteOneBinRange == null) { + // If the regex pattern cannot match against numbers equal to or greater than one, return the bin info for numbers less than one only. + return buildBinFromSingleRange(ltOneBinRange, binFunction); + } else if (ltOneBinRange == null) { + // If the regex pattern cannot match against numbers less than one, return the bin info for numbers equal to or greater than one only. + return buildBinFromSingleRange(gteOneBinRange, binFunction); + } else { + // Otherwise, merge the bin ranges and return them. + CharClassNode charClass = new CharClassNode(); + Node onePlusBin = buildBinFromSingleRange(gteOneBinRange, binFunction); + // If a single character was returned, add it to the character class. Otherwise, a character class with a range was returned. Add the range. + charClass.addChild(onePlusBin instanceof SingleCharNode ? onePlusBin : onePlusBin.getFirstChild()); + + Node subOneBin = buildBinFromSingleRange(ltOneBinRange, binFunction); + // If a single character was returned, add it to the character class. Otherwise, a character class with a range was returned. Add the range. + charClass.addChild(subOneBin instanceof SingleCharNode ? subOneBin : subOneBin.getFirstChild()); + return charClass; + } + } + + /** + * Return a bin info node for a single bin range. + * + * @param binRange + * the bin range, given as a pair of endpoint bins, to build the bin info for + * @param binFunction + * the delegate bin retrieval function + * @return the bin info + */ + private Node buildBinFromSingleRange(Pair<Integer,Integer> binRange, Function<Integer,Character> binFunction) { + if (binRange.getLeft().equals(binRange.getRight())) { + // We have a single bin to cover in this range. Return a single char node. + return new SingleCharNode(binFunction.apply(binRange.getLeft())); + } else { + // We have a range of bins to cover. Create a character class. 
+ CharClassNode charClass = new CharClassNode(); + char left = binFunction.apply(binRange.getLeft()); + char right = binFunction.apply(binRange.getRight()); + int compare = Character.compare(left, right); + // It's possible for the left-side bin to sort alphabetically higher than the right-side bin. If so, flip them around in the character class range. + if (compare < 0) { + charClass.addChild(new CharRangeNode(left, right)); + } else { + charClass.addChild(new CharRangeNode(right, left)); + } + return charClass; + } + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/GTEOneBinFinder.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/GTEOneBinFinder.java new file mode 100644 index 00000000000..1548890fecf --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/GTEOneBinFinder.java @@ -0,0 +1,143 @@ +package datawave.data.normalizer.regex.visitor; + +import org.apache.commons.lang3.tuple.Pair; + +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.NodeType; +import datawave.data.normalizer.regex.RegexUtils; + +/** + * Implementation of {@link BinFinder} that finds the range of exponential bins that a regex pattern should match against for numbers equal to or greater than + * one. + */ +public class GTEOneBinFinder extends BinFinder { + + private static final int MIN_BIN = 0; + private static final int MAX_BIN = 25; + private static final int INITIAL_ENDPOINT_VALUE = -1; + + public static Pair binRangeOf(Node node) { + GTEOneBinFinder calculator = new GTEOneBinFinder(node); + return calculator.getBinRange(); + } + + protected GTEOneBinFinder(Node node) { + super(node, MIN_BIN, MAX_BIN, INITIAL_ENDPOINT_VALUE); + } + + @Override + protected Pair getBinRange() { + calculateRange(); + normalizeRange(); + return getEndpoints(); + } + + /** + * Calculate the bin range.
+ */ + private void calculateRange() { + // Skip any leading zero elements that only match a zero character. + childrenIter.seekPastZeroOnlyElements(); + + // If a decimal point is present, and we have reached it after skipping zero-only elements, there's nothing further to do. + if (childrenIter.index() == decimalPointIndex) { + return; + } + + boolean lockedAtWildcard = false; + boolean nonLeadingZeroSeen = false; + + // Iterate through the remaining children up to the decimal point (if present). + while (childrenIter.hasNext() && !(childrenIter.index() == decimalPointIndex)) { + Node next = childrenIter.next(); + if (lockedAtWildcard) { + // If we have previously locked the lower bound at a wildcard, we do not need to make further evaluations on the current element. Update the + // bin range with it. + updateBinRange(); + } else if (nonLeadingZeroSeen) { + // If the current node is a wildcard, and an explicit decimal point is not present in the regex, lock the lower bound. This will ensure we match + // against numbers that had a decimal point that would match against this wildcard. + if (decimalPointIndex == -1 && next.getType() == NodeType.ANY_CHAR) { + lockLower(); + lockedAtWildcard = true; + } + // If any non-leading zero elements were seen, update the bin range with the current element. We must still check for a wildcard. + updateBinRange(); + } else if (RegexUtils.matchesZeroOnly(next)) { + // The current element matches zero only, e.g. '0' or [0], and is part of a leading zero. Update the bin range with the current element. + updateBinRange(); + } else if (RegexUtils.matchesZero(next)) { + // The current element can match zero and at least one other number. Reset the lower bound, and seek ahead to determine if we should lock the + // lower bound. 
+ setLowerToInitialEndpointValue(); + // If this leading zero is the last element that can match against any other number until the end of the regex, or until the decimal point, we + // must lock the lower bound here. + if (isRemainingZeroOnlyUntilEndOrDecimalPoint()) { + // The current element must occur at least once, so increment lower by one before locking it. + incrementLower(); + + // We want to update the bin range without modifying the lower bound, so lock the lower bound, update the bin range, and then unlock the + // lower bound. The lower bound must be unlocked afterwards to allow for any subsequent zero-only characters to be counted if seen. + lockLower(); + updateBinRange(); + unlockLower(); + } else { + // Update the bin range. + updateBinRange(); + } + } else { + // We've seen our first non-leading zero. Mark it so. + nonLeadingZeroSeen = true; + // Reset the lower bound before updating the bin range. Any elements we saw before this were leading zeros that can be disregarded. + setLowerToInitialEndpointValue(); + updateBinRange(); + } + } + } + + /** + * Return whether, if skipping all elements that can only match zero, there are no more elements or the next element is a decimal point. + * + * @return true if the remaining regex pattern will match zero either until the end or a decimal point, or false otherwise + */ + private boolean isRemainingZeroOnlyUntilEndOrDecimalPoint() { + // Make a note of the iterator's current index so that we can reset it later. + int originalIndex = childrenIter.index(); + + // Skip past any quantifiers or question marks the current element may have had. + childrenIter.seekPastQuantifiers(); + childrenIter.seekPastQuestionMarks(); + + // Find the next node that does not only match the character '0'. + Node nextNonZeroOnlyNode = null; + while (childrenIter.hasNext()) { + Node next = childrenIter.next(); + + // If the current element does not match zero only, we've found our target node. Stop looping. 
+ if (!RegexUtils.matchesZeroOnly(next)) { + nextNonZeroOnlyNode = next; + break; + } + childrenIter.seekPastQuantifiers(); + childrenIter.seekPastQuestionMarks(); + } + // Reset the iterator to the original index. + childrenIter.setIndex(originalIndex); + + return nextNonZeroOnlyNode == null || RegexUtils.isDecimalPoint(nextNonZeroOnlyNode); + } + + /** + * Update the bin range with the current element, taking into account any specified quantifiers. + */ + private void updateBinRange() { + if (childrenIter.hasNext() && childrenIter.isNextQuantifier()) { + // If a quantifier was specified, increment the upper and lower bound based on the quantifier type. + updateRangeWithNextQuantifier(); + } else { + // If no quantifier was specified, increment the upper and lower bound by one. + incrementUpper(); + incrementLower(); + } + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/LTOneBinFinder.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/LTOneBinFinder.java new file mode 100644 index 00000000000..71496d08de4 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/LTOneBinFinder.java @@ -0,0 +1,143 @@ +package datawave.data.normalizer.regex.visitor; + +import org.apache.commons.lang3.tuple.Pair; + +import datawave.data.normalizer.regex.AnyCharNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.RegexUtils; + +/** + * Implementation of {@link BinFinder} that finds the range of exponential bins that a regex pattern should match against for numbers less than one. 
+ */ +public class LTOneBinFinder extends BinFinder { + + private static final int MAX_BIN = 26; + private static final int MIN_BIN = 1; + private static final int INITIAL_ENDPOINT_VALUE = 0; + + public static Pair binRangeOf(Node node) { + LTOneBinFinder calculator = new LTOneBinFinder(node); + return calculator.getBinRange(); + } + + protected LTOneBinFinder(Node node) { + super(node, MIN_BIN, MAX_BIN, INITIAL_ENDPOINT_VALUE); + } + + @Override + protected Pair getBinRange() { + if (decimalPointIndex == -1) { + calculateRangeWithoutDecimalPoint(); + } else { + calculateRangeWithDecimalPoint(); + } + normalizeRange(); + + // When retrieving bins for numbers less than one, the bin values must be negative. Negate the endpoints. + lower = -lower; + upper = -upper; + + return getEndpoints(); + } + + /** + * Calculate the bin range for a pattern that has no decimal point specified in it. + */ + private void calculateRangeWithoutDecimalPoint() { + // Get the index of the first wildcard in the regex, if present. + int firstWildcardIndex = node.indexOf(AnyCharNode.class); + + // If there is no wildcard present in the regex, the regex does not need a bin range for numbers less than one. + if (firstWildcardIndex == -1) { + return; + } + + // If there are any elements before the wildcard, they must all be able to possibly be a leading zero up to the wildcard. If not, the pattern will not + // match against numbers less than one and does not need a bin range for numbers less than one. + while (childrenIter.index() != firstWildcardIndex) { + Node next = childrenIter.peekNext(); + // We found an element that cannot match zero before the wildcard. Return early. + if (!RegexUtils.matchesZero(next)) { + return; + } else { + // We found an element that can match zero. Move the iterator forward, and skip any quantifiers or question marks. 
+ childrenIter.next(); + childrenIter.seekPastQuantifiers(); + childrenIter.seekPastQuestionMarks(); + } + } + + // Skip over the first wildcard, capture any quantifier if present, and skip past any question marks. + childrenIter.next(); + Node quantifier = childrenIter.isNextQuantifier() ? childrenIter.next() : null; + childrenIter.seekPastQuestionMarks(); + + // If there are no elements after the wildcard, and the wildcard did not have a quantifier, there is nothing more to do. + if (!childrenIter.hasNext() && quantifier == null) { + return; + } + + // Otherwise we will at least have the minimum bin range possible. + incrementLower(); + incrementUpper(); + + // If the first wildcard had a quantifier, lock the lower bound and update the upper bound based on the quantifier. + if (quantifier != null) { + lockLower(); + updateRangeWithQuantifier(quantifier); + } + + // Process the remaining children. + processRemainingChildren(); + } + + /** + * Calculate the bin range for a pattern with a decimal point in it. + */ + private void calculateRangeWithDecimalPoint() { + // Seek past children that can match the character '0'. If the next child after this is not the decimal point, then the regex expression will not + // match against numbers less than one. + childrenIter.seekPastZeroMatchingElements(); + if (childrenIter.index() != decimalPointIndex) { + return; + } + + // Skip over the decimal point to the next character. + childrenIter.next(); + // We will at least have the minimum bin range possible. + incrementUpper(); + incrementLower(); + + // Process the remaining children. + processRemainingChildren(); + } + + /** + * Iterate over the remaining children in the children iterator and update the bin range. + */ + private void processRemainingChildren() { + // For each possible leading zero after the decimal point, update the bin range. + while (childrenIter.hasNext()) { + Node next = childrenIter.next(); + // If next can be a leading zero, update the range. 
+ if (RegexUtils.matchesZero(next)) { + // If next can possibly match something other than zero, lock the lower bound. + if (!RegexUtils.matchesZeroOnly(next)) { + lockLower(); + } + + // If the element has a quantifier, increment the upper and lower bound based on the quantifier. + if (childrenIter.isNextQuantifier()) { + updateRangeWithNextQuantifier(); + } else { + // Otherwise increment the upper and lower bound by one. + incrementLower(); + incrementUpper(); + } + } else { + // If next cannot possibly be a leading zero, there is nothing more to do. + return; + } + } + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NegativeNumberPatternInverter.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NegativeNumberPatternInverter.java new file mode 100644 index 00000000000..3f1d7c0ff12 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NegativeNumberPatternInverter.java @@ -0,0 +1,567 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.RegexUtils.toChar; +import static datawave.data.normalizer.regex.RegexUtils.toInt; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import datawave.data.normalizer.regex.AlternationNode; +import datawave.data.normalizer.regex.CharClassNode; +import datawave.data.normalizer.regex.CharRangeNode; +import datawave.data.normalizer.regex.EncodedPatternNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.GroupNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.NodeListIterator; +import datawave.data.normalizer.regex.NodeType; +import datawave.data.normalizer.regex.OneOrMoreNode; +import datawave.data.normalizer.regex.RegexConstants; +import datawave.data.normalizer.regex.RegexUtils; +import
datawave.data.normalizer.regex.RepetitionNode; +import datawave.data.normalizer.regex.SingleCharNode; +import datawave.data.normalizer.regex.ZeroOrMoreNode; + +/** + * Implementation of {@link CopyVisitor} that will return a copy of a regex tree with all patterns that are meant to match negative numbers inverted such that + * they will match against negative numbers that were encoded by {@link datawave.data.type.util.NumericalEncoder}. The numerical encoder encodes negative + * numbers such that the mantissa equals ten minus the mantissa of scientific notation. + * + * @see datawave.data.type.util.NumericalEncoder + */ +public class NegativeNumberPatternInverter extends CopyVisitor { + + private static final int TEN = 10; + private static final int NINE = 9; + + public static Node invert(Node node) { + if (node == null) { + return null; + } + + NegativeNumberPatternInverter visitor = new NegativeNumberPatternInverter(); + return (Node) node.accept(visitor, null); + } + + @Override + public Object visitEncodedPattern(EncodedPatternNode node, Object data) { + // Operate on a copy of the pattern tree. + Node copy = copy(node); + + // If the first character is not !, this is not a negative number pattern. Return the copy. + if (!RegexUtils.isChar(copy.getFirstChild(), RegexConstants.EXCLAMATION_POINT)) { + return copy; + } + + // Create an initial encoded pattern node with all the leading bin info. + EncodedPatternNode encodedPattern = new EncodedPatternNode(); + List children = copy.getChildren(); + int startOfNodesToInvert = 0; + for (Node child : children) { + startOfNodesToInvert++; + encodedPattern.addChild(child); + if (RegexUtils.isChar(child, RegexConstants.CAPITAL_E)) { + break; + } + } + + // Invert the remaining nodes and add them to the encoded pattern node. 
+ List nodesToInvert = new ArrayList<>(children.subList(startOfNodesToInvert, children.size())); + encodedPattern.addChildren(new PatternInverter(nodesToInvert).invert()); + return encodedPattern; + } + + private static class PatternInverter { + + // The node iterator. + protected final NodeListIterator iter; + + // The currently inverted nodes. + protected final List inverted = new ArrayList<>(); + + // The most recent element. + protected Node currentElement; + + // The most recent quantifier. + protected Node currentQuantifier; + + // The most recent question mark. + protected Node currentQuestionMark; + + public PatternInverter(List nodes) { + Collections.reverse(nodes); + this.iter = new NodeListIterator(nodes); + } + + public List invert() { + invertEndingPermutations(); + while (iter.hasNext()) { + captureNext(); + inverted.addAll(subtractCurrentFromNine(false)); + } + Collections.reverse(inverted); + return inverted; + } + + private void invertEndingPermutations() { + // Fetch the first element. + captureNext(); + + // If the first element can occur zero times, e.g. it could match the '0' character (which would not show up in an encoded number), or it has a + // quantifier that allows for zero occurrences, e.g. {0,4}, then we must identify all possible trailing elements that may not occur, and create + // ending permutations that allow for the possibility of each successive element not occurring. The last element of each permutation must be + // inverted with a minuend of 10, and any preceding elements must be inverted with a minuend of 9. + if (currentCanOccurZeroTimes()) { + List> permutations = new ArrayList<>(); + // Add a permutation of the first element inverted with a minuend of 10. + permutations.add(subtractCurrentFromTen()); + // Examine all remaining elements until we find one that must occur at least once. 
+ while (iter.hasNext()) { + captureNext(); + // Add a variant of the current element inverted with a minuend of 9 to all existing permutations. + List subtractedFromNine = subtractCurrentFromNine(true); + for (List permutation : permutations) { + permutation.addAll(subtractedFromNine); + } + // If the current element does not match only the '0' character, add a new permutation with a variant of the current element inverted with a + // minuend of 10. + if (!currentMatchesZeroOnly()) { + permutations.add(0, subtractCurrentFromTen()); + } + if (!currentCanOccurZeroTimes()) { + break; + } + } + if (permutations.size() == 1) { + // If we only have one permutation, the pattern was only one element long, e.g. "\d". Add the sole permutation to the inverted nodes list. + inverted.addAll(permutations.get(0)); + } else { + // If we have multiple permutations, we need to create alternations of these permutations, and wrap them in a group. + // No explicit sort is performed here: each new permutation was inserted at the front of the list, so the newest permutation comes first. + AlternationNode alternation = new AlternationNode(); + for (List permutation : permutations) { + // Reverse the nodes in the permutation to restore the correct order. + Collections.reverse(permutation); + // Add the permutation as an expression to the alternation node. + alternation.addChild(new ExpressionNode(permutation)); + } + // Wrap the alternation in a group before adding it to the inverted nodes list. + inverted.add(new GroupNode(alternation)); + } + + } else { + // The last-most element must occur at least once, and cannot match the character '0'. Invert it with a minuend of 10, and add it to the + // inverted nodes list. + inverted.addAll(subtractCurrentFromTen()); + } + } + + /** + * Return whether the current element represents something that may match against a trailing zero, or may occur zero times.
+ * + * @return whether the current element could occur zero times in target matches + */ + private boolean currentCanOccurZeroTimes() { + if (currentElement.getType() != NodeType.GROUP) { + return RegexUtils.matchesZero(currentElement) || (currentQuantifier != null && RegexUtils.canOccurZeroTimes(currentQuantifier)); + } else { + NodeListIterator groupIter = currentElement.getChildrenIterator(); + Node targetElement = groupIter.next(); + if (RegexUtils.matchesZero(targetElement)) { + return true; + } else { + if (groupIter.isNextQuantifier()) { + return RegexUtils.canOccurZeroTimes(groupIter.next()); + } + return false; + } + } + } + + private boolean currentMatchesZeroOnly() { + if (currentElement.getType() != NodeType.GROUP) { + return RegexUtils.matchesZeroOnly(currentElement); + } else { + return RegexUtils.matchesZeroOnly(currentElement.getFirstChild()); + } + } + + /** + * Return the current element inverted with a minuend of 10. + * + * @return the inverted nodes. + */ + private List subtractCurrentFromTen() { + return ElementInverter.forType(currentElement).subtractFromTen(currentElement, currentQuantifier, currentQuestionMark, true); + } + + /** + * Return the current element inverted with a minuend of 9. + * + * @param endingElement + * whether the current element is an ending permutation element + * @return the inverted nodes + */ + private List subtractCurrentFromNine(boolean endingElement) { + return ElementInverter.forType(currentElement).subtractFromNine(currentElement, currentQuantifier, currentQuestionMark, endingElement); + } + + /** + * Capture the next element, quantifier, and current question mark. + */ + protected void captureNext() { + // Reset the current elements to null. + setCurrentToNull(); + + // Extract the next element, quantifier, and question mark if present. 
+ while (iter.hasNext()) { + if (iter.isNextQuestionMark()) { + currentQuestionMark = iter.next(); + } else if (iter.isNextQuantifier()) { + currentQuantifier = iter.next(); + } else { + currentElement = iter.next(); + break; + } + } + } + + /** + * Set the current element, quantifier, and question mark to null. + */ + protected void setCurrentToNull() { + currentElement = null; + currentQuantifier = null; + currentQuestionMark = null; + } + } + + private interface ElementInverter { + + ElementInverter NON_MODIFYING_INVERTER = new NonModifyingInverter(); + ElementInverter SINGLE_CHAR_INVERTER = new SingleCharInverter(); + ElementInverter CHAR_CLASS_INVERTER = new CharClassInverter(); + ElementInverter GROUP_INVERTER = new GroupInverter(); + + /** + * Return the appropriate {@link ElementInverter} for the element's type. + * + * @param element + * the element + * @return the inverter + */ + static ElementInverter forType(Node element) { + switch (element.getType()) { + case ESCAPED_SINGLE_CHAR: + case ANY_CHAR: + case DIGIT_CHAR_CLASS: + return NON_MODIFYING_INVERTER; + case SINGLE_CHAR: + return SINGLE_CHAR_INVERTER; + case CHAR_CLASS: + return CHAR_CLASS_INVERTER; + case GROUP: + return GROUP_INVERTER; + default: + throw new IllegalArgumentException("Unhandled element type " + element.getType()); + } + } + + List subtractFromNine(Node element, Node quantifier, Node questionMark, boolean endingElement); + + List subtractFromTen(Node element, Node quantifier, Node questionMark, boolean endingElement); + } + + /** + * Abstract implementation of {@link ElementInverter} with some shared functionality. + */ + private static abstract class AbstractInverter implements ElementInverter { + + protected List asList(Node... 
nodes) { + List list = new ArrayList<>(); + for (Node node : nodes) { + if (node != null) { + list.add(node); + } + } + return list; + } + + protected SingleCharNode subtractSingleCharFrom(SingleCharNode node, int minuend) { + char digit = node.getCharacter(); + int value = minuend - RegexUtils.toInt(digit); + return value < 10 ? new SingleCharNode(RegexUtils.toChar(value)) : null; + } + + } + + /** + * Handles elements that do not need to go through inversion, like wildcards or the digit character class {@code \d}. + */ + private static class NonModifyingInverter extends AbstractInverter { + + @Override + public List subtractFromNine(Node element, Node quantifier, Node questionMark, boolean endingElement) { + // If this is an ending permutation element, and the element is marked optional, make it non-optional. + if (endingElement && quantifier == null && questionMark != null) { + return asList(element); + } + // Return the elements in reverse order. + return asList(questionMark, quantifier, element); + } + + @Override + public List subtractFromTen(Node element, Node quantifier, Node questionMark, boolean endingElement) { + // If this is an ending permutation element, and the element is marked optional, make it non-optional. + if (endingElement && quantifier == null && questionMark != null) { + return asList(element); + } + // If the quantifier is *, change it to + to require at least one occurrence. + if (quantifier != null && quantifier.getType() == NodeType.ZERO_OR_MORE) { + quantifier = new OneOrMoreNode(); + } + // Return the elements in reverse order. + return asList(questionMark, quantifier, element); + } + + } + + /** + * Handles inverting single characters. + */ + private static class SingleCharInverter extends AbstractInverter { + + /** + * Return the given element inverted with a minuend of nine. 
+ */ + @Override + public List subtractFromNine(Node element, Node quantifier, Node questionMark, boolean endingElement) { + // Subtract the given digit char from 9. + Node newElement = subtractSingleCharFrom((SingleCharNode) element, NINE); + // If this is an ending permutation element, and the element is marked optional, make it non-optional. + if (endingElement && quantifier == null && questionMark != null) { + return asList(newElement); + } + // Return the elements in reverse order. + return asList(questionMark, quantifier, newElement); + } + + /** + * Return the given char inverted with a minuend of ten. + */ + @Override + public List subtractFromTen(Node element, Node quantifier, Node questionMark, boolean endingElement) { + Node fromTen = subtractSingleCharFrom((SingleCharNode) element, TEN); + // If the element does not have a quantifier, return the question mark and element in reverse order. + if (quantifier == null) { + // If this is an ending permutation element, and the element is marked optional, make it non-optional. + if (endingElement && questionMark != null) { + return asList(fromTen); + } else { + return asList(questionMark, fromTen); + } + } else { + // If the element has a quantifier, we must precede the version of the element subtracted from 10 with a version of the element subtracted from + // 9, and followed by the quantifier with one fewer occurrence. + Node fromNine = subtractSingleCharFrom((SingleCharNode) element, NINE); + switch (quantifier.getType()) { + case ZERO_OR_MORE: + case ONE_OR_MORE: + // The new quantifier should be *. Return the elements in reverse order. + return asList(fromTen, questionMark, new ZeroOrMoreNode(), fromNine); + case REPETITION: + // Get the repetition as a range, and subtract 1 from it. + Pair range = RegexUtils.getRepetitionAsRange((RepetitionNode) quantifier); + range = RegexUtils.subtractOneFrom(range); + if (range.getRight() == null) { + // The new range is {x,}. 
Create a new repetition from the range and use that. + RepetitionNode fromNineQuantifier = RegexUtils.createRepetition(range); + return asList(fromTen, questionMark, fromNineQuantifier, fromNine); + } else if (range.getLeft() == 0 && range.getRight() == 0) { + // The new range is {0,0}, so zero occurrences. Do not include a version of the element subtracted from 9. + return asList(fromTen); + } else if (range.getLeft() == 1 && range.getRight() == 1) { + // The new range is {1,1}, exactly one occurrence. Include a version of the element subtracted from 9, but do not include a + // quantifier. + return asList(fromTen, fromNine); + } else { + // The new range is {x,y}. Create a new repetition from the range and use that. + RepetitionNode fromNineQuantifier = RegexUtils.createRepetition(range); + return asList(fromTen, questionMark, fromNineQuantifier, fromNine); + } + default: + throw new IllegalArgumentException("Unhandled quantifier type " + quantifier.getType()); + } + } + } + } + + /** + * Handles inverting character classes. + */ + private static class CharClassInverter extends AbstractInverter { + + @Override + public List subtractFromNine(Node element, Node quantifier, Node questionMark, boolean endingElement) { + // Subtract each element in the character class from 9 and return the elements in reverse order. + Node newElement = subtractFrom((CharClassNode) element, NINE); + // If this is an ending permutation element, and the element is marked optional, make it non-optional. + if (endingElement && quantifier == null && questionMark != null) { + return asList(newElement); + } + return asList(questionMark, quantifier, newElement); + } + + @Override + public List subtractFromTen(Node element, Node quantifier, Node questionMark, boolean endingElement) { + Node fromTen = subtractFrom((CharClassNode) element, TEN); + // If the element does not have a quantifier, return the question mark and element in reverse order. 
+ if (quantifier == null) { + // If this is an ending permutation element, and the element is marked optional, make it non-optional. + if (endingElement && questionMark != null) { + return asList(fromTen); + } + return asList(questionMark, fromTen); + } else { + // If the element has a quantifier, we must precede the version of the element subtracted from 10 with a version of the element subtracted from + // 9, and followed by the quantifier with one fewer occurrence. + Node fromNine = subtractFrom((CharClassNode) element, NINE); + switch (quantifier.getType()) { + case ZERO_OR_MORE: + case ONE_OR_MORE: + // The new quantifier should be *. Return the elements in reverse order. + return asList(fromTen, questionMark, new ZeroOrMoreNode(), fromNine); + case REPETITION: + // Get the repetition as a range, and subtract 1 from it. + Pair range = RegexUtils.getRepetitionAsRange((RepetitionNode) quantifier); + range = RegexUtils.subtractOneFrom(range); + if (range.getRight() == null) { + // The new range is {x,}. Create a new repetition from the range and use that. + RepetitionNode fromNineQuantifier = RegexUtils.createRepetition(range); + return asList(fromTen, questionMark, fromNineQuantifier, fromNine); + } else if (range.getLeft() == 0 && range.getRight() == 0) { + // The new range is {0,0}, so zero occurrences. Do not include a version of the element subtracted from 9. + return asList(fromTen); + } else if (range.getLeft() == 1 && range.getRight() == 1) { + // The new range is {1,1}, exactly one occurrence. Include a version of the element subtracted from 9, but do not include a + // quantifier. + return asList(fromTen, fromNine); + } else { + // The new range is {x,y}. Create a new repetition from the range and use that. 
+ RepetitionNode fromNineQuantifier = RegexUtils.createRepetition(range); + return asList(fromTen, questionMark, fromNineQuantifier, fromNine); + } + default: + throw new IllegalArgumentException("Unhandled quantifier type " + quantifier.getType()); + } + } + } + + private Node subtractFrom(CharClassNode node, int minuend) { + List children = new ArrayList<>(); + for (Node child : node.getChildren()) { + // The child is a single char. + if (child instanceof SingleCharNode) { + // Invert the child as long as we are not trying to subtract 0 from 10. Otherwise, do not retain the child. + if (minuend != TEN || !RegexUtils.isChar(child, RegexConstants.ZERO)) { + children.add(subtractSingleCharFrom((SingleCharNode) child, minuend)); + } + } else { + // The child is a range. + CharRangeNode range = (CharRangeNode) child; + int rangeStart = toInt(range.getStart()); + // If the current minuend is 10 and the start of the range is 0, adjust the range to start from 1 instead so that we're not subtracting 0 + // from 10. + if (minuend == TEN && rangeStart == 0) { + rangeStart = 1; + } + int startValue = minuend - rangeStart; + int endValue = minuend - toInt(range.getEnd()); + // If the start value is equal to or less than the end value, return the range as (start-end). Otherwise, return the range as (end-start). + if (startValue <= endValue) { + children.add(new CharRangeNode(toChar(startValue), toChar(endValue))); + } else { + children.add(new CharRangeNode(toChar(endValue), toChar(startValue))); + } + } + } + // If after inverting the character class, we only have a single character in it, and the character class is not negated, return the single + // character rather than a character class. + if (children.size() == 1 && children.get(0).getType() == NodeType.SINGLE_CHAR && !node.isNegated()) { + return children.get(0); + } else { + // Otherwise, return a character class. Make a shallow copy in order to also copy over whether the char class is negated. 
+ CharClassNode charClass = node.shallowCopy(); + charClass.addChildren(children); + return charClass; + } + } + } + + /** + * Handles inverting groups that were inserted into the pattern by {@link ZeroTrimmer}. + */ + private static class GroupInverter extends AbstractInverter { + + @Override + public List subtractFromNine(Node element, Node quantifier, Node questionMark, boolean endingElement) { + List children = invertGroup(element, NINE, endingElement); + // If this is an ending permutation element, return the group flattened. + if (endingElement) { + return children; + } else { + // Otherwise return a new group. + return createGroup(children, quantifier, questionMark); + } + } + + @Override + public List subtractFromTen(Node element, Node quantifier, Node questionMark, boolean endingElement) { + List children = invertGroup(element, TEN, endingElement); + // If this is an ending permutation element, return the group flattened. + if (endingElement) { + return children; + } else { + // Otherwise return a new group. + return createGroup(children, quantifier, questionMark); + } + } + + // Return the children of the given group inverted. + private List invertGroup(Node group, int minuend, boolean endingElement) { + // Any group seen here was created by the ZeroTrimmer visitor, and will have at most one element, one quantifier, and one question mark. Fetch them + // from the group. + NodeListIterator iter = group.getChildrenIterator(); + Node element = iter.next(); + Node quantifier = iter.hasNext() && iter.isNextQuantifier() ? iter.next() : null; + Node questionMark = iter.hasNext() && iter.isNextQuestionMark() ? iter.next() : null; + + // Fetch the appropriate inverter for the element type. + ElementInverter inverter = ElementInverter.forType(element); + + // Invert the elements based on the minuend.
+ List inverted; + switch (minuend) { + case NINE: + inverted = inverter.subtractFromNine(element, quantifier, questionMark, endingElement); + break; + case TEN: + inverted = inverter.subtractFromTen(element, quantifier, questionMark, endingElement); + break; + default: + throw new IllegalArgumentException("Invalid minuend " + minuend); + } + + // Return the inverted nodes. We do not need to return them as groups, but can flatten it instead. + return inverted; + } + + private List createGroup(List children, Node quantifier, Node questionMark) { + Collections.reverse(children); + return asList(questionMark, quantifier, new GroupNode(children)); + } + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NegativeVariantExpander.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NegativeVariantExpander.java new file mode 100644 index 00000000000..b8e9c4e440e --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NegativeVariantExpander.java @@ -0,0 +1,64 @@ +package datawave.data.normalizer.regex.visitor; + +import datawave.data.normalizer.regex.AlternationNode; +import datawave.data.normalizer.regex.AnyCharNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.RegexConstants; +import datawave.data.normalizer.regex.SingleCharNode; + +/** + * Implementation of {@link CopyVisitor} that expands all regex expressions with negative variants of sub-expressions where applicable, particularly in the case + * of a complete expression with a leading wildcard. See the following examples: + *
    + *
  • Input {@code ".4.*"} will return {@code ".4.*|-.4.*"}
  • + *
  • Input {@code ".*4"} will return {@code ".*4|-.*4"}
  • + *
  • Input {@code ".*?4"} will return {@code ".*?4|-.*?4"}
  • + *
  • Input {@code ".+4"} will return {@code ".+4|-.+4"}
  • + *
  • Input {@code ".+?4"} will return {@code ".+?4|-.+?4"}
  • + *
+ * Regexes with leading wildcards that have a negative sign in front of them will not require any expansion. + */ +public class NegativeVariantExpander extends SubExpressionVisitor { + + public static Node expand(Node node) { + if (node == null) { + return null; + } + NegativeVariantExpander visitor = new NegativeVariantExpander(); + return (Node) node.accept(visitor, null); + } + + @Override + protected Object visitSubExpression(Node node) { + if (node.getFirstChild() instanceof AnyCharNode) { + return expandLeadingWildcard(node); + } else { + return copy(node); + } + } + + /** + * Return an expression that contains the original expression, as well as a negative variant of it. + * + * @param node + * the expression to expand + * @return the expanded expression + */ + private Node expandLeadingWildcard(Node node) { + // Create a copy of the original expression. + Node negativeCopy = copy(node); + + // Insert a negative sign directly before the wildcard character. + SingleCharNode negativeSign = new SingleCharNode(RegexConstants.HYPHEN); + negativeCopy.addChild(negativeSign, 0); + + // Create an alternation node with a copy of the original expression and the negative copy as its children. + AlternationNode alternation = new AlternationNode(); + alternation.addChild(copy(node)); + alternation.addChild(negativeCopy); + + // Return the alternation as the child of a new expression node. 
+ return new ExpressionNode(alternation); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NonEncodedNumbersChecker.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NonEncodedNumbersChecker.java new file mode 100644 index 00000000000..4b6d029b95f --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NonEncodedNumbersChecker.java @@ -0,0 +1,68 @@ +package datawave.data.normalizer.regex.visitor; + +import datawave.data.normalizer.regex.AlternationNode; +import datawave.data.normalizer.regex.EncodedNumberNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.Node; + +/** + * An implementation of {@link BaseVisitor} that will examine a node tree and return whether any non-encoded patterns remain in the tree. This is intended to be + * used in conjunction with {@link SimpleNumberEncoder} to see if any further work remains to be done after encoding any and all simple numbers in the tree via + * {@link SimpleNumberEncoder#encode(Node)}. + * + * @see SimpleNumberEncoder + */ +public class NonEncodedNumbersChecker extends BaseVisitor { + + /** + * Check if there are any non-encoded number patterns still present in the tree. + * + * @param node + * the node to check + * @return true if there are any non-encoded patterns, or false otherwise. + */ + public static boolean check(Node node) { + NonEncodedNumbersChecker visitor = new NonEncodedNumbersChecker(); + node.accept(visitor, null); + return visitor.hasUnencodedPatterns; + } + + private boolean hasUnencodedPatterns = false; + + @Override + public Object visitExpression(ExpressionNode node, Object data) { + // If we have not yet found any unencoded patterns, check the node. + if (!this.hasUnencodedPatterns) { + // If we have an alternation, examine the alternation. 
+ if (node.getFirstChild() instanceof AlternationNode) { + return super.visitExpression(node, data); + } else { + // Otherwise, check if the node's first child is an encoded number. + this.hasUnencodedPatterns = !(node.getFirstChild() instanceof EncodedNumberNode); + } + } + return null; + } + + @Override + public Object visitAlternation(AlternationNode node, Object data) { + // If we have not yet found any unencoded patterns, check each child. + if (!this.hasUnencodedPatterns) { + for (Node child : node.getChildren()) { + child.accept(this, data); + // If we found a child with an unencoded pattern, return early. + if (this.hasUnencodedPatterns) { + break; + } + } + + } + return null; + } + + @Override + public Object visitEncodedNumber(EncodedNumberNode node, Object data) { + // No need to traverse down into the children. + return null; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NumericCharClassValidator.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NumericCharClassValidator.java new file mode 100644 index 00000000000..e2c98cfb3b7 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NumericCharClassValidator.java @@ -0,0 +1,61 @@ +package datawave.data.normalizer.regex.visitor; + +import datawave.data.normalizer.regex.CharClassNode; +import datawave.data.normalizer.regex.CharRangeNode; +import datawave.data.normalizer.regex.EscapedSingleCharNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.RegexConstants; +import datawave.data.normalizer.regex.SingleCharNode; + +/** + * Implementation of {@link BaseVisitor} that accepts a {@link Node} tree and verifies that any {@link CharClassNode} instances in it only have the following + * children: + *
    + *
  • A {@link SingleCharNode} that has a digit.
  • + *
  • A {@link CharRangeNode} that has a digit start and a digit end.
  • + *
+ */ +public class NumericCharClassValidator extends BaseVisitor { + + private static final String ERROR_MESSAGE = "Character classes may only contain numeric characters and numeric ranges."; + + public static void validate(Node node) { + if (node != null) { + NumericCharClassValidator visitor = new NumericCharClassValidator(); + node.accept(visitor, null); + } + } + + @Override + public Object visitCharClass(CharClassNode node, Object data) { + for (Node child : node.getChildren()) { + if (child instanceof EscapedSingleCharNode) { + // Do not allow any escaped characters. + throw new IllegalArgumentException(ERROR_MESSAGE); + } else if (child instanceof SingleCharNode) { + // Verify the character is a period or digit. + validate((SingleCharNode) child); + } else if (child instanceof CharRangeNode) { + // Verify the range is numeric. + validate((CharRangeNode) child); + } + } + return null; + } + + private void validate(SingleCharNode node) { + if (!RegexConstants.ALL_DIGITS.contains(node.getCharacter())) { + throw new IllegalArgumentException(ERROR_MESSAGE); + } + } + + private void validate(CharRangeNode node) { + if (!RegexConstants.ALL_DIGITS.contains(node.getStart())) { + throw new IllegalArgumentException(ERROR_MESSAGE); + } + if (!RegexConstants.ALL_DIGITS.contains(node.getEnd())) { + throw new IllegalArgumentException(ERROR_MESSAGE); + } + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/OptionalVariantExpander.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/OptionalVariantExpander.java new file mode 100644 index 00000000000..713d8c64826 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/OptionalVariantExpander.java @@ -0,0 +1,167 @@ +package datawave.data.normalizer.regex.visitor; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +import datawave.data.normalizer.regex.AlternationNode; 
+import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.OneOrMoreNode; +import datawave.data.normalizer.regex.QuestionMarkNode; +import datawave.data.normalizer.regex.RegexUtils; +import datawave.data.normalizer.regex.RepetitionNode; +import datawave.data.normalizer.regex.ZeroOrMoreNode; + +/** + * Implementation of {@link CopyVisitor} that will return a copy of the tree where elements marked as optional are expanded such each optional character results + * in an alternation variant with the optional character present, and an alternation variant not present. This does not apply to optional found after a star, + * plus, or repetition quantifier, or any optionals applying to a character that occur after an escaped decimal point; e.g. in the cases of {@code ".*?111"}, + * {@code ".+?111"}, {@code "14{3}?1"}, or {@code "12\.4?"}. See the following examples of cases where an optional will result in variants. + *
    + *
  • Input {@code "2?"} will return {@code "2"}
  • + *
  • Input {@code "2.?5"} will return {@code "25|2.5"}
  • + *
  • Input {@code "2[3-9]?5"} will return {@code "25|2[3-9]5"}
  • + *
  • Input {@code "27?5"} will return {@code "25|275"}
  • + *
  • Input {@code "2(45.*)?5"} will return {@code "25|2(45.*)5"}
  • + *
  • Input {@code "2\.?5"} will return {@code "25|2\.5"}
  • + *
  • Input {@code "-?25"} will return {@code "25|-25"}
  • + *
+ */ +public class OptionalVariantExpander extends SubExpressionVisitor { + + public static Node expand(Node node) { + if (node == null) { + return null; + } + OptionalVariantExpander visitor = new OptionalVariantExpander(); + return (Node) node.accept(visitor, null); + } + + @Override + protected Object visitSubExpression(Node node) { + if (node.isAnyChildOf(QuestionMarkNode.class)) { + return expandOptionals(node); + } else { + return copy(node); + } + } + + /** + * Return an expression that contains the expanded variants of each expanded optional. + * + * @param node + * the expression to expand + * @return the expanded expression + */ + private Node expandOptionals(Node node) { + List expansions = new ArrayList<>(); + expansions.add(new ExpressionNode()); + + int startIndex = 0; + int optionalPos = node.indexOf(QuestionMarkNode.class); + int posBeforeOptional = optionalPos - 1; + int decimalPoint = RegexUtils.getDecimalPointIndex(node); + + // If the first optional found is after an escaped decimal point, there is no need to do any expansion. Return a copy of the copy. + if (decimalPoint != -1 && decimalPoint < posBeforeOptional) { + return copy(node); + } + + do { + // Children from the start index (inclusive) to the position before optional (not inclusive) can be added to each expansion. + expansions = addChildrenToExpansions(expansions, node, startIndex, posBeforeOptional); + // Move the start index to the position before the optional. + startIndex = posBeforeOptional; + + // If the optional is not a modifier to make a quantifier match in lazy mode, add expansions for each variant. + Node childBeforeOptional = node.getChildAt(posBeforeOptional); + if (!(isOptionalLazyModifierFor(childBeforeOptional))) { + expansions = addOptionalElement(expansions, childBeforeOptional); + startIndex = optionalPos + 1; + } + + // Determine the position of the next optional node, and the child before it. 
+ optionalPos = node.indexOf(QuestionMarkNode.class, (optionalPos + 1)); + posBeforeOptional = optionalPos - 1; + + // If there is an escaped decimal point in the regex, and the next optional is for a character after it, there is no need to do any further + // expansion. + if (decimalPoint != -1 && decimalPoint < posBeforeOptional) { + break; + } + } while (optionalPos != -1); + + // If we have any remaining children to copy to each expansion, do so. + if (startIndex < (node.getChildCount())) { + expansions = addChildrenToExpansions(expansions, node, startIndex, node.getChildCount()); + } + + // Remove any expansions that are leafs without children. + expansions = expansions.stream().filter((ex) -> !ex.isLeaf()).collect(Collectors.toList()); + + // If we only have one expression after expansion, return the expression. + if (expansions.size() == 1) { + return expansions.get(0); + } else { + // Otherwise return an expression containing each expansion as an alternation. + return new ExpressionNode(new AlternationNode(expansions)); + } + } + + /** + * Return whether the given node is a *, +, or a repetition quantifier. + * + * @param node + * the node + * @return true if the node is a *, +, or a repetition quantifier, or false otherwise. + */ + private boolean isOptionalLazyModifierFor(Node node) { + return node instanceof ZeroOrMoreNode || node instanceof OneOrMoreNode || node instanceof RepetitionNode; + } + + /** + * Add the children of the given node from the start index (inclusive) to the end index (not inclusive) to each expansion in the list. 
+ * + * @param expansions + * the expansions + * @param node + * the node + * @param startIndex + * the start index of children to copy (inclusive) + * @param endIndex + * the end index of children to copy (not inclusive) + * @return an updated list of expansions + */ + private List addChildrenToExpansions(List expansions, Node node, int startIndex, int endIndex) { + List newExpansions = new ArrayList<>(); + for (Node expansion : expansions) { + Node newExpansion = copy(expansion); + for (int index = startIndex; index < endIndex; index++) { + newExpansion.addChild(copy(node.getChildAt(index))); + } + newExpansions.add(newExpansion); + } + return newExpansions; + } + + /** + * Add the given optional element to each expansion, preserving a copy of each original expansion. + * + * @param expansions + * the expansions + * @param optionalElement + * the optional element + * @return an updated list of expansions + */ + private List addOptionalElement(List expansions, Node optionalElement) { + List newExpansions = new ArrayList<>(); + for (Node expansion : expansions) { + newExpansions.add(copy(expansion)); + Node newExpansion = copy(expansion); + newExpansion.addChild(copy(optionalElement)); + newExpansions.add(newExpansion); + } + return newExpansions; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/PrintVisitor.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/PrintVisitor.java new file mode 100644 index 00000000000..d3d9825b1ed --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/PrintVisitor.java @@ -0,0 +1,223 @@ +package datawave.data.normalizer.regex.visitor; + +import datawave.data.normalizer.regex.AlternationNode; +import datawave.data.normalizer.regex.AnyCharNode; +import datawave.data.normalizer.regex.CharClassNode; +import datawave.data.normalizer.regex.CharRangeNode; +import datawave.data.normalizer.regex.DigitCharClassNode; +import 
datawave.data.normalizer.regex.EmptyNode; +import datawave.data.normalizer.regex.EncodedNumberNode; +import datawave.data.normalizer.regex.EncodedPatternNode; +import datawave.data.normalizer.regex.EndAnchorNode; +import datawave.data.normalizer.regex.EscapedSingleCharNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.GroupNode; +import datawave.data.normalizer.regex.IntegerNode; +import datawave.data.normalizer.regex.IntegerRangeNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.OneOrMoreNode; +import datawave.data.normalizer.regex.QuestionMarkNode; +import datawave.data.normalizer.regex.RepetitionNode; +import datawave.data.normalizer.regex.SingleCharNode; +import datawave.data.normalizer.regex.StartAnchorNode; +import datawave.data.normalizer.regex.ZeroOrMoreNode; + +/** + * A {@link Visitor} implementation that accepts a {@link Node} tree and streams a pretty-print of it to {@link System#out}. + */ +public class PrintVisitor implements Visitor { + + private static final String PREFIX = " "; + + private interface Output { + void write(String line); + } + + private static class SystemOutput implements Output { + + @Override + public void write(String line) { + System.out.println(line); + } + } + + private static class StringBuilderOutput implements Output { + + private final StringBuilder sb = new StringBuilder(); + + @Override + public void write(String line) { + sb.append("\n").append(line); + } + } + + /** + * Streams a pretty-print of the given node to {@link System#out}. + * + * @param node + * the node to print + */ + public static void printToSysOut(Node node) { + if (node == null) { + System.out.println("null"); + } else { + PrintVisitor visitor = new PrintVisitor(new SystemOutput()); + node.accept(visitor, ""); + } + } + + /** + * Returns a string containing a pretty print of the given node. 
+ * + * @param node + * the node + * @return the string + */ + public static String printToString(Node node) { + if (node == null) { + return "null"; + } else { + StringBuilderOutput output = new StringBuilderOutput(); + PrintVisitor visitor = new PrintVisitor(output); + node.accept(visitor, ""); + return output.sb.toString(); + } + } + + private final Output output; + + protected PrintVisitor(Output output) { + this.output = output; + } + + private void print(Node node, Object data) { + printLine(node, data); + if (node != null) { + node.childrenAccept(this, (data + PREFIX)); + } + } + + private void printLine(Node node, Object data) { + output.write(data + "" + node); + } + + @Override + public Object visitExpression(ExpressionNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitAlternation(AlternationNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitGroup(GroupNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitDigitChar(DigitCharClassNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitCharClass(CharClassNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitCharRange(CharRangeNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitSingleChar(SingleCharNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitEscapedSingleChar(EscapedSingleCharNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitRepetition(RepetitionNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitQuestionMark(QuestionMarkNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitAnyChar(AnyCharNode node, Object data) { + 
print(node, data); + return null; + } + + @Override + public Object visitZeroToMany(ZeroOrMoreNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitOneToMany(OneOrMoreNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitInteger(IntegerNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitIntegerRange(IntegerRangeNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitEmpty(EmptyNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitStartAnchor(StartAnchorNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitEndAnchor(EndAnchorNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitEncodedNumber(EncodedNumberNode node, Object data) { + print(node, data); + return null; + } + + @Override + public Object visitEncodedPattern(EncodedPatternNode node, Object data) { + print(node, data); + return null; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/SimpleNumberEncoder.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/SimpleNumberEncoder.java new file mode 100644 index 00000000000..8d24581b64d --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/SimpleNumberEncoder.java @@ -0,0 +1,90 @@ +package datawave.data.normalizer.regex.visitor; + +import datawave.data.normalizer.regex.EncodedNumberNode; +import datawave.data.normalizer.regex.EndAnchorNode; +import datawave.data.normalizer.regex.EscapedSingleCharNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.RegexParser; +import datawave.data.normalizer.regex.RegexUtils; +import datawave.data.normalizer.regex.SingleCharNode; +import 
datawave.data.normalizer.regex.StartAnchorNode; +import datawave.data.type.util.NumericalEncoder; + +/** + * An implementation of {@link CopyVisitor} that will encode any simple-numbers in the regex pattern, and store them inside a {@link EncodedNumberNode} + * instance. Any expressions that do not represent a simple number will not be modified. See the following examples: + *
    + *
  • Input {@code "123\.45"} will return {@code "\+cE1\.2345"}.
  • + *
  • Input {@code "23.*"} will return {@code "23.*"}.
  • + *
  • Input {@code "-342|23.*"} will return {@code "!XE6\.58|23.*"}.
  • + *
+ */ +public class SimpleNumberEncoder extends SubExpressionVisitor { + + /** + * Return a copy of the given tree with all simple numbers encoded. + * + * @param node + * the node to encode + * @return the encoded node + */ + public static Node encode(Node node) { + if (node == null) { + return null; + } + SimpleNumberEncoder visitor = new SimpleNumberEncoder(); + return (Node) node.accept(visitor, null); + } + + @Override + protected Object visitSubExpression(Node node) { + // If the expression is a simple number, encode it. + if (RegexUtils.isSimpleNumber(node)) { + Node normalized = normalizeNumber(node); + return new EncodedNumberNode(normalized.getChildren()); + } else { + // Otherwise return a copy. + return copy(node); + } + } + + /** + * Create an encoded simple number regex from the given node. It is expected that the given node represents a simple number regex. + * + * @param node + * the node to encode + * @return the encoded node. + */ + private Node normalizeNumber(Node node) { + // Create a number string from the node. Do not include backlashes or anchor characters. + StringBuilder sb = new StringBuilder(); + for (Node child : node.getChildren()) { + if (child instanceof EscapedSingleCharNode) { + sb.append(((EscapedSingleCharNode) child).getCharacter()); + } else if (child instanceof SingleCharNode) { + sb.append(((SingleCharNode) child).getCharacter()); + } + } + + // Encode and escape the number. + String encodedNumber = NumericalEncoder.encode(sb.toString()); + encodedNumber = RegexUtils.escapeEncodedNumber(encodedNumber); + + // Parse the number to a node. + Node encodedNode = RegexParser.parse(encodedNumber); + + // If the original expression contained a starting anchor, include it in the encoded node. + Node firstChild = node.getFirstChild(); + if (firstChild instanceof StartAnchorNode) { + encodedNode.addChild(firstChild.shallowCopy(), 0); + } + + // If the original expression contained an ending anchor, include it in the encoded node. 
+ Node lastChild = node.getLastChild(); + if (lastChild instanceof EndAnchorNode) { + encodedNode.addChild(lastChild.shallowCopy()); + } + + return encodedNode; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/StringVisitor.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/StringVisitor.java new file mode 100644 index 00000000000..3f88cf1ff9c --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/StringVisitor.java @@ -0,0 +1,192 @@ +package datawave.data.normalizer.regex.visitor; + +import java.util.Iterator; + +import datawave.data.normalizer.regex.AlternationNode; +import datawave.data.normalizer.regex.AnyCharNode; +import datawave.data.normalizer.regex.CharClassNode; +import datawave.data.normalizer.regex.CharRangeNode; +import datawave.data.normalizer.regex.DigitCharClassNode; +import datawave.data.normalizer.regex.EmptyNode; +import datawave.data.normalizer.regex.EncodedNumberNode; +import datawave.data.normalizer.regex.EncodedPatternNode; +import datawave.data.normalizer.regex.EndAnchorNode; +import datawave.data.normalizer.regex.EscapedSingleCharNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.GroupNode; +import datawave.data.normalizer.regex.IntegerNode; +import datawave.data.normalizer.regex.IntegerRangeNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.OneOrMoreNode; +import datawave.data.normalizer.regex.QuestionMarkNode; +import datawave.data.normalizer.regex.RepetitionNode; +import datawave.data.normalizer.regex.SingleCharNode; +import datawave.data.normalizer.regex.StartAnchorNode; +import datawave.data.normalizer.regex.ZeroOrMoreNode; + +public class StringVisitor implements Visitor { + + public static String toString(Node node) { + if (node == null) { + return null; + } + StringVisitor visitor = new StringVisitor(); + StringBuilder sb = new 
StringBuilder(); + node.accept(visitor, sb); + return sb.toString(); + } + + @Override + public Object visitExpression(ExpressionNode node, Object data) { + node.childrenAccept(this, data); + return null; + } + + @Override + public Object visitAlternation(AlternationNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + Iterator iterator = node.getChildren().iterator(); + while (iterator.hasNext()) { + iterator.next().accept(this, sb); + if (iterator.hasNext()) { + sb.append("|"); + } + } + return null; + } + + @Override + public Object visitGroup(GroupNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + sb.append("("); + node.childrenAccept(this, sb); + sb.append(")"); + return null; + } + + @Override + public Object visitDigitChar(DigitCharClassNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + sb.append("\\d"); + return null; + } + + @Override + public Object visitCharClass(CharClassNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + sb.append("["); + if (node.isNegated()) { + sb.append("^"); + } + node.childrenAccept(this, sb); + sb.append("]"); + return null; + } + + @Override + public Object visitCharRange(CharRangeNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + sb.append(node.getStart()).append("-").append(node.getEnd()); + return null; + } + + @Override + public Object visitSingleChar(SingleCharNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + sb.append(node.getCharacter()); + return null; + } + + @Override + public Object visitEscapedSingleChar(EscapedSingleCharNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + sb.append("\\").append(node.getCharacter()); + return null; + } + + @Override + public Object visitRepetition(RepetitionNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + sb.append("{"); + node.childrenAccept(this, sb); + sb.append("}"); + return null; + } + + @Override + public Object 
visitQuestionMark(QuestionMarkNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + sb.append("?"); + return null; + } + + @Override + public Object visitAnyChar(AnyCharNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + sb.append("."); + return null; + } + + @Override + public Object visitZeroToMany(ZeroOrMoreNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + sb.append("*"); + return null; + } + + @Override + public Object visitOneToMany(OneOrMoreNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + sb.append("+"); + return null; + } + + @Override + public Object visitInteger(IntegerNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + sb.append(node.getValue()); + return null; + } + + @Override + public Object visitIntegerRange(IntegerRangeNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + sb.append(node.getStart()); + sb.append(","); + if (node.isEndBounded()) { + sb.append(node.getEnd()); + } + return null; + } + + @Override + public Object visitEmpty(EmptyNode node, Object data) { + return null; + } + + @Override + public Object visitStartAnchor(StartAnchorNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + sb.append("^"); + return null; + } + + @Override + public Object visitEndAnchor(EndAnchorNode node, Object data) { + StringBuilder sb = (StringBuilder) data; + sb.append("$"); + return null; + } + + @Override + public Object visitEncodedNumber(EncodedNumberNode node, Object data) { + node.childrenAccept(this, data); + return null; + } + + @Override + public Object visitEncodedPattern(EncodedPatternNode node, Object data) { + node.childrenAccept(this, data); + return null; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/SubExpressionVisitor.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/SubExpressionVisitor.java new file mode 100644 index 
00000000000..53919964b2f --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/SubExpressionVisitor.java @@ -0,0 +1,98 @@ +package datawave.data.normalizer.regex.visitor; + +import java.util.ArrayList; +import java.util.List; +import java.util.Set; + +import com.google.common.collect.ImmutableSet; + +import datawave.data.normalizer.regex.AlternationNode; +import datawave.data.normalizer.regex.EncodedNumberNode; +import datawave.data.normalizer.regex.EncodedPatternNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.GroupNode; +import datawave.data.normalizer.regex.Node; + +/** + * An implementation of {@link CopyVisitor} that allows delegation of operations to be performed on sub-expressions of a regex, specifically, each alternated + * expression of a regex with alternations, or the entire expression if no alternations are present. + */ +public class SubExpressionVisitor extends CopyVisitor { + + private static final Set> VALID_TOP_LEVEL_TYPES = ImmutableSet.of(GroupNode.class, EncodedNumberNode.class, EncodedPatternNode.class); + + @Override + public Object visitExpression(ExpressionNode node, Object data) { + if (node.getFirstChild() instanceof AlternationNode) { + return super.visitExpression(node, data); + } else { + return visitSubExpression(node); + } + } + + /** + * By default, return a copy of the sub-expression. This method should be overridden by any subclasses that need to manipulate sub-expressions. + * + * @param node + * the sub-expression + * @return the visited sub-expression + */ + protected Object visitSubExpression(Node node) { + return copy(node); + } + + /** + * Visit each sub-expression of the alternation with this visitor. 
+ * + * @param node + * the alternation node + * @param data + * the data + * @return null if all visited children returned null, an {@link ExpressionNode} if a single visited child returned a non-null result, or an + * {@link AlternationNode} with all non-null results from visited children + */ + @Override + public Object visitAlternation(AlternationNode node, Object data) { + List children = new ArrayList<>(); + // Visit each alternated child. + for (Node child : node.getChildren()) { + Node visited = (Node) child.accept(this, data); + // Do not retain null children. + if (visited != null) { + // If the returned node is an alternation node, retain each child of the returned alternation node. + if (visited instanceof AlternationNode) { + children.addAll(visited.getChildren()); + } else if (visited instanceof ExpressionNode) { + if (visited.getChildCount() == 1 && visited.getFirstChild() instanceof AlternationNode) { + // If the returned node is an expression with an alternation child, retain each child of the alternation node. + children.addAll(visited.getFirstChild().getChildren()); + } else if (visited.getChildCount() == 1 && VALID_TOP_LEVEL_TYPES.contains(visited.getFirstChild().getClass())) { + // If the returned node is an expression with a single child that is a top-level node type, retain the first child. + children.add(visited.getFirstChild()); + } else { + // Otherwise retain the entire expression. + children.add(visited); + } + } else if (VALID_TOP_LEVEL_TYPES.contains(visited.getClass())) { + // If the returned node is a valid top-level class, retain it. + children.add(visited); + } else { + throw new IllegalArgumentException("Visited alternation child must be alternation or expression, but was " + visited); + } + } + } + + // If there are no children, return null. + if (children.isEmpty()) { + return null; + } else if (children.size() == 1) { + // If there is only one child, return the child. 
+ return children.get(0); + } else { + // Otherwise return a new alternation node. + AlternationNode copy = new AlternationNode(); + copy.addChildren(children); + return copy; + } + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/Visitor.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/Visitor.java new file mode 100644 index 00000000000..b9ebcc01dd0 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/Visitor.java @@ -0,0 +1,65 @@ +package datawave.data.normalizer.regex.visitor; + +import datawave.data.normalizer.regex.AlternationNode; +import datawave.data.normalizer.regex.AnyCharNode; +import datawave.data.normalizer.regex.CharClassNode; +import datawave.data.normalizer.regex.CharRangeNode; +import datawave.data.normalizer.regex.DigitCharClassNode; +import datawave.data.normalizer.regex.EmptyNode; +import datawave.data.normalizer.regex.EncodedNumberNode; +import datawave.data.normalizer.regex.EncodedPatternNode; +import datawave.data.normalizer.regex.EndAnchorNode; +import datawave.data.normalizer.regex.EscapedSingleCharNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.GroupNode; +import datawave.data.normalizer.regex.IntegerNode; +import datawave.data.normalizer.regex.IntegerRangeNode; +import datawave.data.normalizer.regex.OneOrMoreNode; +import datawave.data.normalizer.regex.QuestionMarkNode; +import datawave.data.normalizer.regex.RepetitionNode; +import datawave.data.normalizer.regex.SingleCharNode; +import datawave.data.normalizer.regex.StartAnchorNode; +import datawave.data.normalizer.regex.ZeroOrMoreNode; + +public interface Visitor { + + Object visitExpression(ExpressionNode node, Object data); + + Object visitAlternation(AlternationNode node, Object data); + + Object visitGroup(GroupNode node, Object data); + + Object visitDigitChar(DigitCharClassNode node, Object data); + + Object 
visitCharClass(CharClassNode node, Object data); + + Object visitCharRange(CharRangeNode node, Object data); + + Object visitSingleChar(SingleCharNode node, Object data); + + Object visitEscapedSingleChar(EscapedSingleCharNode node, Object data); + + Object visitRepetition(RepetitionNode node, Object data); + + Object visitQuestionMark(QuestionMarkNode node, Object data); + + Object visitAnyChar(AnyCharNode node, Object data); + + Object visitZeroToMany(ZeroOrMoreNode node, Object data); + + Object visitOneToMany(OneOrMoreNode node, Object data); + + Object visitInteger(IntegerNode node, Object data); + + Object visitIntegerRange(IntegerRangeNode node, Object data); + + Object visitEmpty(EmptyNode node, Object data); + + Object visitStartAnchor(StartAnchorNode node, Object data); + + Object visitEndAnchor(EndAnchorNode node, Object data); + + Object visitEncodedNumber(EncodedNumberNode node, Object data); + + Object visitEncodedPattern(EncodedPatternNode node, Object data); +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroLengthRepetitionTrimmer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroLengthRepetitionTrimmer.java new file mode 100644 index 00000000000..5c7eb5b8cd3 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroLengthRepetitionTrimmer.java @@ -0,0 +1,97 @@ +package datawave.data.normalizer.regex.visitor; + +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.IntegerNode; +import datawave.data.normalizer.regex.IntegerRangeNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.NodeListIterator; +import datawave.data.normalizer.regex.QuestionMarkNode; +import datawave.data.normalizer.regex.RepetitionNode; + +/** + * Implementation of {@link CopyVisitor} that will return a copy of the tree trimmed of any characters that were immediately followed by a 
zero-length + * repetition quantifier, i.e. {@code {0}} or {@code {0,0}}. See the following examples: + *
<ul>
+ * <li>Input {@code "123.*"} will return {@code "123.*"}.</li>
+ * <li>Input {@code "123{3}"} will return {@code "123{3}"}.</li>
+ * <li>Input {@code "12[3-6]{0}"} will return {@code "12"}.</li>
+ * <li>Input {@code "12[3-6]{0,0}"} will return {@code "12"}.</li>
+ * <li>Input {@code "2{0,0}|3{0}"} will return null.</li>
+ * </ul>
+ */ +public class ZeroLengthRepetitionTrimmer extends SubExpressionVisitor { + + /** + * Return a copy of the given tree trimmed of all characters followed by a zero-length repetition quantifier. If the entire tree is trimmed, null will be + * returned, otherwise an {@link ExpressionNode} with the trimmed tree will be returned. + * + * @param node + * the node to trim + * @return the trimmed node + */ + public static Node trim(Node node) { + if (node == null) { + return null; + } + ZeroLengthRepetitionTrimmer visitor = new ZeroLengthRepetitionTrimmer(); + return (Node) node.accept(visitor, null); + } + + @Override + public Object visitExpression(ExpressionNode node, Object data) { + Node visited = (Node) super.visitExpression(node, data); + return visited != null && visited.isLeaf() ? null : visited; + } + + @Override + protected Object visitSubExpression(Node node) { + Node copy = new ExpressionNode(); + NodeListIterator iter = node.getChildrenIterator(); + + // Check each child for any zero-length repetitions. + while (iter.hasNext()) { + Node next = iter.next(); + if (iter.hasNext() && iter.isNextInstanceOf(RepetitionNode.class)) { + Node repetition = iter.next(); + // If we have a zero-length repetition, do not copy it. + if (isZeroLengthRepetition(repetition)) { + // If there is a ? after the repetition, move past it. + if (iter.hasNext() && iter.isNextInstanceOf(QuestionMarkNode.class)) { + iter.next(); + } + } else { + // Otherwise this is a non-zero length repetition. Copy it. + copy.addChild(copy(next)); + copy.addChild(copy(repetition)); + } + } else { + // The child is not followed by a repetition. Copy it. + copy.addChild(copy(next)); + } + } + + // If we have any children after removing zero-length repetitions, return the copy. Otherwise, return null. + if (copy.hasChildren()) { + return copy; + } else { + return null; + } + } + + /** + * Return whether the given repetition is {@code {0}} or {@code {0,0}}. 
+ * + * @param node + * the node + * @return true if the given repetition is a zero-length repetition, or false otherwise + */ + private boolean isZeroLengthRepetition(Node node) { + Node child = node.getFirstChild(); + if (child instanceof IntegerNode) { + return ((IntegerNode) child).getValue() == 0; + } else { + IntegerRangeNode rangeNode = (IntegerRangeNode) child; + return rangeNode.getStart() == 0 && rangeNode.isEndBounded() && rangeNode.getEnd() == 0; + } + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroTrimmer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroTrimmer.java new file mode 100644 index 00000000000..fec62759a6e --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroTrimmer.java @@ -0,0 +1,722 @@ +package datawave.data.normalizer.regex.visitor; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.commons.lang3.tuple.Pair; + +import datawave.data.normalizer.ZeroRegexStatus; +import datawave.data.normalizer.regex.AnyCharNode; +import datawave.data.normalizer.regex.EncodedPatternNode; +import datawave.data.normalizer.regex.EscapedSingleCharNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.GroupNode; +import datawave.data.normalizer.regex.IntegerNode; +import datawave.data.normalizer.regex.IntegerRangeNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.NodeListIterator; +import datawave.data.normalizer.regex.NodeType; +import datawave.data.normalizer.regex.OneOrMoreNode; +import datawave.data.normalizer.regex.QuestionMarkNode; +import datawave.data.normalizer.regex.RegexConstants; +import datawave.data.normalizer.regex.RegexUtils; +import datawave.data.normalizer.regex.RepetitionNode; +import datawave.data.normalizer.regex.SingleCharNode; +import 
datawave.data.normalizer.regex.ZeroOrMoreNode; + +/** + * Implementation of {@link CopyVisitor} that trims and consolidates leading zeros for partially encoded regex patterns. + */ +public class ZeroTrimmer extends CopyVisitor { + + /** + * Return a copy of the node tree with all leading zeros for partially encoded regex patterns either trimmed and/or consolidated. + * + * @param node + * the node + * @return the trimmed tree + */ + public static Node trim(Node node) { + if (node == null) { + return null; + } + ZeroTrimmer visitor = new ZeroTrimmer(); + return (Node) node.accept(visitor, null); + } + + public static ZeroRegexStatus getStatus(List<Node> encodedRegexNodes) { + if (hasPossiblyLeadingZeroes(encodedRegexNodes)) { + return ZeroRegexStatus.LEADING; + } else if (hasTrailingZeroes(encodedRegexNodes)) { + return ZeroRegexStatus.TRAILING; + } else + return ZeroRegexStatus.NONE; + + } + + private static boolean hasTrailingZeroes(List<Node> encodedRegexNodes) { + // Walk a reversed copy so that the caller's list keeps its original order. + List<Node> reversed = new ArrayList<>(encodedRegexNodes); + Collections.reverse(reversed); + NodeListIterator iter = new NodeListIterator(reversed); + + while (iter.hasNext()) { + iter.seekPastQuestionMarks(); + iter.seekPastQuantifiers(); + iter.seekPastQuestionMarks(); + + Node next = iter.peekNext(); + + if (RegexUtils.matchesZero(next)) { + if (RegexUtils.matchesZeroExplicitly(next)) { + return true; + } + iter.next(); + } else { + return false; + } + + } + return true; + } + + private static boolean hasPossiblyLeadingZeroes(List<Node> encodedRegexNodes) { + NodeListIterator iter = new NodeListIterator(encodedRegexNodes); + + while (iter.hasNext()) { + Node next = iter.peekNext(); + + if (RegexUtils.matchesZero(next)) { + return true; + } else if (RegexUtils.isChar(next, RegexConstants.HYPHEN) || next.equals(new EscapedSingleCharNode(RegexConstants.PERIOD))) { + iter.next(); + } else { + return false; + } + } + + return true; + + } + + @Override + public Object visitEncodedPattern(EncodedPatternNode node, Object data) { + EncodedPatternNode trimmed =
new EncodedPatternNode(); + + // Create a new node and add each child up to (inclusively) the 'E' character. + int startOfRemainingNodes = 0; + for (int i = 0; i < node.getChildCount(); i++) { + Node child = node.getChildAt(i); + trimmed.addChild(copy(child)); + if (RegexUtils.isChar(child, RegexConstants.CAPITAL_E)) { + startOfRemainingNodes = i + 1; + break; + } + } + + // Copy the remaining children into a separate list. This list will be modified as zeros are trimmed. + List nodes = new ArrayList<>(); + for (int i = startOfRemainingNodes; i < node.getChildCount(); i++) { + Node child = node.getChildAt(i); + // At this point we no longer need to keep the original decimal point. A new decimal point will be added later in the correct spot. + if (!RegexUtils.isDecimalPoint(child)) { + nodes.add(copy(child)); + } + } + + // Check if the remaining children represent a single regex element. If so, no trimming is required. + if (isSingleElementPattern(nodes)) { + trimmed.addChildren(nodes); + return trimmed; + } + + // Trim leading and trailing zeros. + nodes = trimLeadingZeros(nodes); + nodes = trimTrailingZeros(nodes); + + // Add the new nodes to the node to return. + trimmed.addChildren(nodes); + return trimmed; + } + + /** + * Trim/consolidate leading zeros. + * + * @param nodes + * the nodes to trim + * @return the trimmed nodes + */ + private List trimLeadingZeros(List nodes) { + nodes = trimLeadingZeroOnlyElements(nodes); + return consolidatePossibleLeadingZeros(nodes); + } + + /** + * Trim/consolidate trailing zeros. + * + * @param nodes + * the nodes to trim + * @return the trimmed nodes + */ + private List trimTrailingZeros(List nodes) { + // Reverse the nodes. + Collections.reverse(nodes); + nodes = trimTrailingZeroOnlyElements(nodes); + nodes = consolidatePossibleTrailingZeros(nodes); + // Restore the original order. 
+ Collections.reverse(nodes); + return nodes; + } + + /** + * Return true if the given list consists only of one regex element that may or may not be followed by a quantifier or question mark. + * + * @param nodes + * the nodes + * @return true if the list consists of a single element pattern, or false otherwise + */ + private boolean isSingleElementPattern(List nodes) { + NodeListIterator iter = new NodeListIterator(nodes); + iter.next(); + iter.seekPastQuantifiers(); + iter.seekPastQuestionMarks(); + return !iter.hasNext(); + } + + /** + * Trim all leading nodes that only match zero. Trimming will stop once the first element that can match something other than zero is seen. + * + * @param nodes + * the nodes + * @return a list of trimmed nodes + */ + private List trimLeadingZeroOnlyElements(List nodes) { + NodeListIterator iter = new NodeListIterator(nodes); + while (iter.hasNext()) { + Node next = iter.peekNext(); + // If the next element matches zero only, skip past it, and any quantifiers and/or question marks after it. + if (RegexUtils.matchesZeroOnly(next)) { + iter.next(); + iter.seekPastQuantifiers(); + iter.seekPastQuestionMarks(); + } else { + break; + } + } + + // If no leading zeros were seen, return the original list, otherwise return a sublist. + return iter.index() == 0 ? nodes : new ArrayList<>(nodes.subList(iter.index(), nodes.size())); + } + + /** + * Return a list with all possible leading zeros consolidated, and any elements made optional as needed. + * + * @param nodes + * the nodes to consolidate + * @return a list of consolidated nodes + */ + private List consolidatePossibleLeadingZeros(List nodes) { + // If the first node cannot match zero, there is nothing further to do. Return the entire list. + if (!RegexUtils.matchesZero(nodes.get(0))) { + return nodes; + } + + // Iterate through each child. 
+ NodeListIterator iter = new NodeListIterator(nodes); + List consolidated = new ArrayList<>(); + while (iter.hasNext()) { + // Do not call next until we know the next node can match zero. + Node next = iter.peekNext(); + // The next node can match zero. Call next, and call the specific consolidation method based on whether the node can match only zero, or other + // numbers. + if (RegexUtils.matchesZero(next)) { + if (RegexUtils.matchesZeroOnly(next)) { + consolidated.addAll(consolidateLeadingMatchesZeroOnly(iter)); + } else { + consolidated.addAll(consolidateLeadingMatchesZero(iter)); + } + } else { + break; + } + } + + // Add the remaining nodes to the list to return. + while (iter.hasNext()) { + consolidated.add(iter.next()); + } + return consolidated; + } + + /** + * Consolidate any leading zeros that can possibly match zero. + * + * @param iter + * the iterator + * @return the consolidated nodes. + */ + private List consolidateLeadingMatchesZero(NodeListIterator iter) { + List nodes = new ArrayList<>(); + while (iter.hasNext()) { + // Do not call next until we know the next node can match zero. + Node next = iter.peekNext(); + // The next node can match zero. The first call to next should always return an element that can match zero, but not only zero. + if (RegexUtils.matchesZero(next)) { + iter.next(); + // If the node is followed by a quantifier and/or optional, evaluate the quantifier. + if (iter.isNextQuantifier()) { + Node quantifier = iter.next(); + switch (quantifier.getType()) { + case ZERO_OR_MORE: + case ONE_OR_MORE: + // In both the case of * or + for a leading zero, we must ensure that * is used in the final regex to allow for zero occurrences of + // the leading zero when matching. + nodes.add(next); + nodes.add(new ZeroOrMoreNode()); + // If the quantifier was followed by ?, append the ?. 
+ if (iter.isNextQuestionMark()) { + nodes.add(iter.next()); + } + break; + case REPETITION: + RepetitionNode repetition = (RepetitionNode) quantifier; + // If the repetition does not already allow for zero occurrences, we must create a new repetition quantifier that does so. + if (!RegexUtils.repetitionCanOccurZeroTimes(repetition)) { + if (RegexUtils.isNotRange(repetition)) { + // If the repetition is has the form {x}, replace it with {0,x}. For example, "[012]{3}" will become "[012]{0,3}". + nodes.add(next); + nodes.add(RegexUtils.createRangeStartingFromZero(repetition)); + // If the original quantifier was followed by ?, append it. + if (iter.isNextQuestionMark()) { + nodes.add(iter.next()); + } + } else { + // If the repetition has the form {x,y}, where x is a value greater than zero, we must wrap the element and the repetition + // in an optional group to allow for it to occur either zero times, or x-y times. For example, "[012]{3,5}" will become + // "([012]{3,5})?". Create a group node with the element and repetition as its children. + GroupNode groupNode = new GroupNode(); + groupNode.addChild(next); + groupNode.addChild(repetition); + // If the original quantifier was followed by ?, include it in the group. + if (iter.isNextQuestionMark()) { + groupNode.addChild(iter.next()); + } + // Add the group node and make it optional. + nodes.add(groupNode); + nodes.add(new QuestionMarkNode()); + } + } else { + // The repetition allows for zero occurrences. No modifications need to be made. + nodes.add(next); + nodes.add(repetition); + if (iter.isNextQuestionMark()) { + nodes.add(iter.next()); + } + } + break; + default: + throw new IllegalArgumentException("Unsupported quantifier type: " + quantifier.getType()); + } + } else { + // Add the node and make it optional since it can possibly be a leading zero, and thus must be optional. 
+ nodes.add(next); + nodes.add(new QuestionMarkNode()); + } + + // If there are any elements directly after the current element that only match zero, consolidate them and add the result. + if (iter.hasNext() && RegexUtils.matchesZeroOnly(iter.peekNext())) { + nodes.addAll(consolidateLeadingMatchesZeroOnly(iter)); + } + } else { + // The next element cannot match zero. Nothing more to do. + break; + } + } + return nodes; + } + + /** + * Consolidate the next consecutive elements that can only match zero. + * + * @param iter + * the iterator + * @return a list of the consolidated nodes + */ + private List consolidateLeadingMatchesZeroOnly(NodeListIterator iter) { + // We need to track the minimum and maximum times a leading zero can occur. + int minZeroCount = 0; + int maxZeroCount = 0; + + while (iter.hasNext()) { + // Do not call next until we've confirmed the next node only matches zero. + Node next = iter.peekNext(); + if (RegexUtils.matchesZeroOnly(next)) { + // Explicitly call next now. + iter.next(); + // If the zero has a quantifier, extract the quantifier range. + if (iter.isNextQuantifier()) { + Pair<Integer,Integer> quantifierRange = RegexUtils.getQuantifierRange(iter.next()); + // Increment the lower bound. + minZeroCount += quantifierRange.getLeft(); + if (maxZeroCount != -1) { + // If the quantifier range has no defined upper bound, that is equivalent to unlimited. Set the max bound to -1 to ensure it is not + // changed. + if (quantifierRange.getRight() == null) { + maxZeroCount = -1; + } else { + // Otherwise increment the upper bound. + maxZeroCount += quantifierRange.getRight(); + } + } + } else { + // The zero does not have a quantifier. Increment the min count by one, and increment the max count only if we have not yet determined that + // the max should be considered unlimited. + minZeroCount++; + if (maxZeroCount != -1) { + maxZeroCount++; + } + } + // Skip any question marks if present.
+ iter.seekPastQuestionMarks(); + } else { + // If the next node does not only match zero, stop iterating. + break; + } + } + + List nodes = new ArrayList<>(); + // If the min and max are both 1, return 0? + if (minZeroCount == 1 && maxZeroCount == 1) { + nodes.add(new SingleCharNode(RegexConstants.ZERO)); + nodes.add(new QuestionMarkNode()); + } else { + // Otherwise we need return 0 followed by a quantifier inside an optional group. + GroupNode groupNode = new GroupNode(); + groupNode.addChild(new SingleCharNode(RegexConstants.ZERO)); + + if (maxZeroCount == -1 && minZeroCount < 2) { + if (minZeroCount == 0) { + // Return (0*)? + groupNode.addChild(new ZeroOrMoreNode()); + } else if (minZeroCount == 1) { + // Return (0+)? + groupNode.addChild(new OneOrMoreNode()); + } + } else { + RepetitionNode repetition = new RepetitionNode(); + if (minZeroCount == maxZeroCount) { + // Return (0{x})? + IntegerNode integer = new IntegerNode(minZeroCount); + repetition.addChild(integer); + } else { + // Return (0{x,y})? or (0{x,})? if unlimited max. + IntegerRangeNode integerRange = new IntegerRangeNode(); + integerRange.setStart(minZeroCount); + if (maxZeroCount != -1) { + integerRange.setEnd(maxZeroCount); + } + repetition.addChild(integerRange); + } + + groupNode.addChild(repetition); + } + nodes.add(groupNode); + // Ensure the group is optional. + nodes.add(new QuestionMarkNode()); + } + + return nodes; + } + + /** + * Trim all trailing nodes that explicitly only match zero. Trimming will stop once the first element that can match something other than zero is seen. + * + * @param nodes + * the nodes + * @return a list of trimmed nodes + */ + private List trimTrailingZeroOnlyElements(List nodes) { + NodeListIterator iter = new NodeListIterator(nodes); + + while (iter.hasNext()) { + // Keep a record of the current index so that we can reset it once we find an element that cannot match zero. 
+ int lastIndex = iter.index(); + // Skip past any question marks or quantifiers that are before the element. Remember, the node list is in reverse order. + iter.seekPastQuestionMarks(); + iter.seekPastQuantifiers(); + Node next = iter.peekNext(); + // If the next element matches zero only, skip past it. + if (RegexUtils.matchesZeroOnly(next)) { + iter.next(); + } else { + // Reset the index to the non-zero matching element. + iter.setIndex(lastIndex); + break; + } + } + + // If no trailing zeros were seen, return the original list, otherwise return a sublist. + return iter.index() == 0 ? nodes : new ArrayList<>(nodes.subList(iter.index(), nodes.size())); + } + + /** + * Return a list with all possible trailing zeros consolidated, and any elements made optional as needed. + * + * @param nodes + * the nodes to consolidate + * @return a list of consolidated nodes + */ + private List consolidatePossibleTrailingZeros(List nodes) { + // List of consolidated nodes. + List consolidated = new ArrayList<>(); + NodeListIterator iter = new NodeListIterator(nodes); + + // Check if the pattern ends with '.+' or '.+?'. In this case, the '.+' must become a '.*' to allow for matching against numbers that had trailing zeros + // that were subsequently trimmed when encoded. + if (iter.hasNext()) { + int lastIndex = iter.index(); + Node questionMark = iter.isNextQuestionMark() ? iter.next() : null; + Node quantifier = iter.isNextQuantifier() ? iter.next() : null; + Node next = iter.next(); + // if the last element of the pattern is .+, convert it to .*. + if (next.getType() == NodeType.ANY_CHAR && quantifier != null && quantifier.getType() == NodeType.ONE_OR_MORE) { + if (questionMark != null) { + consolidated.add(questionMark); + } + consolidated.add(new ZeroOrMoreNode()); + consolidated.add(new AnyCharNode()); + } else { + // Otherwise reset the index to the initial index. + iter.setIndex(lastIndex); + } + } + + // Iterate through each child. 
+ while (iter.hasNext()) { + int lastIndex = iter.index(); + iter.seekPastQuestionMarks(); + iter.seekPastQuantifiers(); + + // Do not call next until we know the next node can match zero. + Node next = iter.peekNext(); + // The next node can match zero. Call next, and call the specific consolidation method based on whether the node can match only zero, or other + // numbers. + if (RegexUtils.matchesZero(next)) { + if (RegexUtils.matchesZeroOnly(next)) { + iter.setIndex(lastIndex); + consolidated.addAll(consolidateTrailingMatchesZeroOnly(iter)); + } else { + iter.setIndex(lastIndex); + consolidated.addAll(consolidateTrailingMatchesZero(iter)); + } + } else { + // Reset the index to the non-zero matching element. + iter.setIndex(lastIndex); + break; + } + } + + // Add the remaining nodes to the list to return. + while (iter.hasNext()) { + consolidated.add(iter.next()); + } + return consolidated; + } + + /** + * Consolidate any trailing zeros that can possibly match zero. + * + * @param iter + * the iterator + * @return the consolidated nodes. + */ + private List consolidateTrailingMatchesZero(NodeListIterator iter) { + List nodes = new ArrayList<>(); + while (iter.hasNext()) { + int lastIndex = iter.index(); + + // Skip past and capture the optional and quantifier for the node if present. + Node questionMark = iter.isNextQuestionMark() ? iter.next() : null; + Node quantifier = iter.isNextQuantifier() ? iter.next() : null; + Node next = iter.next(); + // The next node can match zero. The first call to next should always return an element that can match zero, but not only zero. + if (RegexUtils.matchesZero(next)) { + // If the next node had a quantifier, evaluate the quantifier. + if (quantifier != null) { + switch (quantifier.getType()) { + case ZERO_OR_MORE: + case ONE_OR_MORE: + // In both the case of * or + for a leading zero, we must ensure that * is used in the final regex to allow for zero occurrences of + // the leading zero when matching. 
+ // If the quantifier was followed by ?, append the ?. + if (questionMark != null) { + nodes.add(questionMark); + } + nodes.add(new ZeroOrMoreNode()); + nodes.add(next); + break; + case REPETITION: + RepetitionNode repetition = (RepetitionNode) quantifier; + // If the repetition does not already allow for zero occurrences, we must create a new repetition quantifier that does so. + if (!RegexUtils.repetitionCanOccurZeroTimes(repetition)) { + if (RegexUtils.isNotRange(repetition)) { + // If the repetition is has the form {x}, replace it with {0,x}. For example, "[012]{3}" will become "[012]{0,3}". + // If the original quantifier was followed by ?, append it. + if (questionMark != null) { + nodes.add(questionMark); + } + nodes.add(RegexUtils.createRangeStartingFromZero(repetition)); + nodes.add(next); + } else { + // If the repetition has the form {x,y}, where x is a value greater than zero, we must wrap the element and the repetition + // in an optional group to allow for it to occur either zero times, or x-y times. For example, "[012]{3,5}" will become + // "([012]{3,5})?". Create a group node with the element and repetition as its children. + GroupNode groupNode = new GroupNode(); + groupNode.addChild(next); + groupNode.addChild(repetition); + // If the original quantifier was followed by ?, include it in the group. + if (questionMark != null) { + groupNode.addChild(questionMark); + } + // Make the group optional. + nodes.add(new QuestionMarkNode()); + nodes.add(groupNode); + } + } else { + // The repetition allows for zero occurrences. No modifications need to be made. + if (questionMark != null) { + nodes.add(questionMark); + } + nodes.add(repetition); + nodes.add(next); + } + break; + default: + throw new IllegalArgumentException("Unsupported quantifier type: " + quantifier.getType()); + } + } else { + // This is a single element. Make it optional. 
+ nodes.add(new QuestionMarkNode()); + nodes.add(next); + } + + // If there are any elements after the current element that only match zero, consolidate them and add the result. + if (iter.hasNext()) { + lastIndex = iter.index(); + iter.seekPastQuestionMarks(); + iter.seekPastQuantifiers(); + if (RegexUtils.matchesZeroOnly(iter.peekNext())) { + iter.setIndex(lastIndex); + nodes.addAll(consolidateTrailingMatchesZeroOnly(iter)); + } else { + iter.setIndex(lastIndex); + } + } + } else { + // The next element cannot match zero. Nothing more to do. Reset the index to right before the non-zero element. + iter.setIndex(lastIndex); + break; + } + } + return nodes; + } + + /** + * Consolidate the next consecutive elements that can only match zero. + * + * @param iter + * the iterator + * @return a list of the consolidated nodes + */ + private List consolidateTrailingMatchesZeroOnly(NodeListIterator iter) { + // We need to track the minimum and maximum times a trailing zero can occur. + int minZeroCount = 0; + int maxZeroCount = 0; + + while (iter.hasNext()) { + int lastIndex = iter.index(); + // Skip any question mark if present. + iter.seekPastQuestionMarks(); + // Grab the quantifier if present. + Node quantifier = iter.isNextQuantifier() ? iter.next() : null; + + // Do not call next until we've confirmed the next node only matches zero. + Node next = iter.peekNext(); + if (RegexUtils.matchesZeroOnly(next)) { + // Explicitly call next now. + iter.next(); + // If the zero has a quantifier, extract the quantifier range. + if (quantifier != null) { + Pair<Integer,Integer> quantifierRange = RegexUtils.getQuantifierRange(quantifier); + // Increment the lower bound. + minZeroCount += quantifierRange.getLeft(); + if (maxZeroCount != -1) { + // If the quantifier range has no defined upper bound, that is equivalent to unlimited. Set the max bound to -1 to ensure it is not + // changed.
+ if (quantifierRange.getRight() == null) { + maxZeroCount = -1; + } else { + // Otherwise increment the upper bound. + maxZeroCount += quantifierRange.getRight(); + } + } + } else { + // The zero does not have a quantifier. Increment the min count by one, and increment the max count only if we have not yet determined that + // the max should be considered unlimited. + minZeroCount++; + if (maxZeroCount != -1) { + maxZeroCount++; + } + } + } else { + // If the next node does not only match zero, stop iterating. + iter.setIndex(lastIndex); + break; + } + } + + List nodes = new ArrayList<>(); + // Make the element optional. + nodes.add(new QuestionMarkNode()); + + // If the min and max are both 1, return 0? + if (minZeroCount == 1 && maxZeroCount == 1) { + nodes.add(new SingleCharNode(RegexConstants.ZERO)); + } else { + // Otherwise we need return 0 followed by a quantifier inside an optional group. + GroupNode groupNode = new GroupNode(); + groupNode.addChild(new SingleCharNode(RegexConstants.ZERO)); + + if (maxZeroCount == -1 && minZeroCount < 2) { + if (minZeroCount == 0) { + // Return (0*)? + groupNode.addChild(new ZeroOrMoreNode()); + } else if (minZeroCount == 1) { + // Return (0+)? + groupNode.addChild(new OneOrMoreNode()); + } + } else { + RepetitionNode repetition = new RepetitionNode(); + if (minZeroCount == maxZeroCount) { + // Return (0{x})? + IntegerNode integer = new IntegerNode(minZeroCount); + repetition.addChild(integer); + } else { + // Return (0{x,y})? or (0{x,})? if unlimited max. 
+ IntegerRangeNode integerRange = new IntegerRangeNode(); + integerRange.setStart(minZeroCount); + if (maxZeroCount != -1) { + integerRange.setEnd(maxZeroCount); + } + repetition.addChild(integerRange); + } + + groupNode.addChild(repetition); + } + + nodes.add(groupNode); + } + + return nodes; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroValueNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroValueNormalizer.java new file mode 100644 index 00000000000..4351b38045f --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroValueNormalizer.java @@ -0,0 +1,157 @@ +package datawave.data.normalizer.regex.visitor; + +import java.util.function.Consumer; + +import datawave.data.normalizer.regex.AlternationNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.NodeListIterator; +import datawave.data.normalizer.regex.RegexConstants; +import datawave.data.normalizer.regex.RegexUtils; +import datawave.data.normalizer.regex.SingleCharNode; + +/** + * Implementation of {@link CopyVisitor} that: + *
<ol> + * <li>Simplifies any positive non-simple number patterns that can only match zero to {@code "0"}.</li> + * <li>Simplifies any negative non-simple number patterns that can only match zero to {@code "0"}.</li> + * <li>Identifies any negative non-simple number patterns that can match zero, and adds a {@code "0"} alternation.</li> + * </ol>
+ */ +public class ZeroValueNormalizer extends SubExpressionVisitor { + + /** + * Apply a new {@link ZeroValueNormalizer} to the given node. + * + * @param node + * the node to normalize, possibly null + * @return the result of visiting the node, or null if the given node is null + */ + public static Node expand(Node node) { + if (node == null) { + return null; + } + ZeroValueNormalizer normalizer = new ZeroValueNormalizer(); + return (Node) node.accept(normalizer, null); + } + + /** + * Return a copy of the given sub-expression if it is a simple number, or its zero-normalized form otherwise. + * + * @param node + * the sub-expression + * @return the copied or normalized sub-expression + */ + @Override + protected Object visitSubExpression(Node node) { + // If the node represents a simple number, return a copy of it. + if (RegexUtils.isSimpleNumber(node)) { + return copy(node); + } + + return normalizePattern(node, RegexUtils.isNegativeRegex(node)); + } + + /** + * Normalize the given non-simple number pattern with respect to zero. + * + * @param node + * the pattern + * @param negative + * whether the pattern is negative + * @return a {@code "0"} expression if the pattern only matches zero, an expression alternating the pattern with {@code "0"} if the pattern can match + * zero, or a copy of the pattern otherwise + */ + private Node normalizePattern(Node node, boolean negative) { + // If the pattern can only match zero, simplify it to just '0'. + if (matchesZeroOnly(node, negative)) { + return createZeroCharExpression(); + } + // If the pattern can match zero, add an alternation for '0'. + if (patternMatchesZero(node, negative)) { + AlternationNode alternation = new AlternationNode(); + // NOTE(review): the given node itself, not a copy, is added here, unlike the final branch below. Confirm this is intended. + alternation.addChild(node); + alternation.addChild(createZeroCharExpression()); + return new ExpressionNode(alternation); + } + // Otherwise the pattern can match numbers other than zero. Return a copy of it. + return copy(node); + } + + /** + * Return whether the given pattern will only match 0. + * + * @param node + * the node + * @param negative + * whether the pattern is negative + * @return true if the pattern will only match 0, or false otherwise + */ + private boolean matchesZeroOnly(Node node, boolean negative) { + // The minimum child count and index of the first non-minus sign node depends on whether the pattern is negative. + int minChildCount = negative ? 2 : 1; + int firstChild = negative ? 1 : 0; + if (node.getChildCount() == minChildCount) { + // If the minimum number of children is present, return whether it matches zero only. + return RegexUtils.matchesZeroOnly(node.getChildAt(firstChild)); + } else { + // If there are multiple children, return whether all children match zero only.
+ NodeListIterator iter = node.getChildrenIterator(); + // Skip past the minus sign if present. + if (negative) { + iter.next(); + } + // Seek past all elements that only match zero. + seekPastAllZeroOnlyElements(iter); + return !iter.hasNext(); + } + } + + /** + * Return whether the given pattern can match zero. + * + * @param node + * the pattern + * @param negative + * whether the pattern is negative + * @return true if the pattern can match 0, or false otherwise + */ + private boolean patternMatchesZero(Node node, boolean negative) { + // The minimum child count and index of the first non-minus sign node depends on whether the pattern is negative. + int minChildCount = negative ? 2 : 1; + int firstChild = negative ? 1 : 0; + if (node.getChildCount() == minChildCount) { + // If the minimum number of children is present, return whether that child can match zero. + return RegexUtils.matchesZero(node.getChildAt(firstChild)); + } else { + // If there are multiple children, return whether all children can match zero. + NodeListIterator iter = node.getChildrenIterator(); + if (negative) { + // Skip past the minus sign. + iter.next(); + } + // Seek past all elements that can match zero. + seekPastAllZeroMatchingElements(iter); + return !iter.hasNext(); + } + } + + /** + * Return a new {@link ExpressionNode} that contains the expression {@code "0"}. + * + * @return the new node + */ + private Node createZeroCharExpression() { + return new ExpressionNode(new SingleCharNode(RegexConstants.ZERO)); + } + + /** + * Seek past all consecutive elements that only match zero in the given iterator, including any after a decimal point. + * + * @param iterator + * the iterator + */ + private void seekPastAllZeroOnlyElements(NodeListIterator iterator) { + seekPast(iterator, NodeListIterator::seekPastZeroOnlyElements); + } + + /** + * Seek past all consecutive elements that can match zero in the given iterator, including any after a decimal point.
+ * + * @param iterator + * the iterator + */ + private void seekPastAllZeroMatchingElements(NodeListIterator iterator) { + seekPast(iterator, NodeListIterator::seekPastZeroMatchingElements); + } + + /** + * Seek past elements using the given delegate function. If a decimal point is present, seek past that as well. + * + * @param iter + * the iterator + * @param delegate + * the delegate function + */ + private void seekPast(NodeListIterator iter, Consumer delegate) { + delegate.accept(iter); + if (iter.hasNext() && RegexUtils.isDecimalPoint(iter.peekNext())) { + iter.next(); + delegate.accept(iter); + } + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/parser/GeometryParser.java b/core/utils/type-utils/src/main/java/datawave/data/parser/GeometryParser.java new file mode 100644 index 00000000000..52795afcc1d --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/parser/GeometryParser.java @@ -0,0 +1,22 @@ +package datawave.data.parser; + +import org.locationtech.jts.geom.Geometry; + +public abstract class GeometryParser implements Comparable { + + public static final int DEFAULT_PRIORITY = 0; + + public abstract Geometry parseGeometry(String geoString); + + // Used for sorting + // Smaller numbers have higher priority + protected abstract int getPriority(); + + /** + * Orders parsers by ascending priority value, breaking ties by fully-qualified class name. + * + * @param other + * the parser to compare against + * @return a negative, zero, or positive value as this parser sorts before, with, or after the other parser + */ + @Override + public int compareTo(GeometryParser other) { + // Integer.compare avoids the overflow (and inverted ordering) that subtracting two arbitrary ints can produce. + int compare = Integer.compare(this.getPriority(), other.getPriority()); + if (compare == 0) + compare = this.getClass().getName().compareTo(other.getClass().getName()); + return compare; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/parser/WKBParser.java b/core/utils/type-utils/src/main/java/datawave/data/parser/WKBParser.java new file mode 100644 index 00000000000..cc3bca67c7c --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/parser/WKBParser.java @@ -0,0 +1,34 @@ +package datawave.data.parser; + +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.io.WKBReader;
+import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.io.BaseEncoding; + +/** + * This class can be used to parse a geometry object from Base64 encoded well-known binary (WKB). + */ +public class WKBParser extends GeometryParser { + + private static final Logger log = LoggerFactory.getLogger(WKBParser.class); + + /** + * Decodes the given Base64 string and parses the resulting bytes as WKB. + * + * @param geoString + * the Base64 encoded WKB + * @return the parsed geometry, or null if the string could not be decoded or parsed + */ + @Override + public Geometry parseGeometry(String geoString) { + Geometry geom = null; + try { + byte[] wkbBytes = BaseEncoding.base64().decode(geoString); + geom = new WKBReader().read(wkbBytes); + } catch (Exception e) { + // Best effort: a failure is reported as a null result, but the cause is kept in the trace output rather than dropped. + if (log.isTraceEnabled()) + log.trace("Cannot parse WKB geometry from [" + geoString + "]", e); + } + return geom; + } + + @Override + protected int getPriority() { + return DEFAULT_PRIORITY + 1; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/parser/WKTParser.java b/core/utils/type-utils/src/main/java/datawave/data/parser/WKTParser.java new file mode 100644 index 00000000000..083a21ca417 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/parser/WKTParser.java @@ -0,0 +1,39 @@ +package datawave.data.parser; + +import org.apache.commons.lang3.StringUtils; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.io.WKTReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class WKTParser extends GeometryParser { + + private static final Logger log = LoggerFactory.getLogger(WKTParser.class); + + private static final String[] geomTypes = new String[] {"GEOMETRY", "POINT", "LINESTRING", "POLYGON", "MULTIPOINT", "MULTILINESTRING", "MULTIPOLYGON", + "GEOMETRYCOLLECTION", "CIRCULARSTRING", "COMPOUNDCURVE", "CURVEPOLYGON", "MULTICURVE", "MULTISURFACE", "CURVE", "SURFACE", "POLYHEDRALSURFACE", + "TIN", "TRIANGLE"}; + private static final String[] zGeomTypes = new String[geomTypes.length]; + + static { + for (int i = 0; i < geomTypes.length; i++) + zGeomTypes[i] = geomTypes[i] + " Z"; + } + + /** + * Parses the given string as WKT after replacing each {@code "<TYPE> Z"} geometry type name with its plain {@code "<TYPE>"} form. + * + * @param geoString + * the WKT string + * @return the parsed geometry, or null if the string could not be parsed + */ + @Override + public Geometry parseGeometry(String
geoString) { + Geometry geom = null; + try { + geom = new WKTReader().read(StringUtils.replaceEach(geoString, zGeomTypes, geomTypes)); + } catch (Exception e) { + // Best effort: a failure is reported as a null result, but the cause is kept in the trace output rather than dropped. + if (log.isTraceEnabled()) + log.trace("Cannot parse WKT geometry from [" + geoString + "]", e); + } + return geom; + } + + @Override + protected int getPriority() { + return DEFAULT_PRIORITY; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/AbstractGeometryType.java b/core/utils/type-utils/src/main/java/datawave/data/type/AbstractGeometryType.java new file mode 100644 index 00000000000..27970caccef --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/AbstractGeometryType.java @@ -0,0 +1,99 @@ +package datawave.data.type; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.geom.Polygon; + +import datawave.data.normalizer.DiscreteIndexNormalizer; +import datawave.data.normalizer.Normalizer; +import datawave.data.normalizer.OneToManyNormalizer; +import datawave.data.type.util.AbstractGeometry; + +/** + * The base GeoWave geometry type, which provides an implementation for the discrete index type interface.
+ * + * @param + * The underlying geometry type + */ +public abstract class AbstractGeometryType> extends BaseType implements DiscreteIndexType { + + private static final long GEOMETRY_FACTORY_SIZE = 120; + private static final long ENVELOPE_SIZE = 45; + private static final long GEOMETRY_BASE_SIZE = ENVELOPE_SIZE + 20; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF + Sizer.REFERENCE + GEOMETRY_FACTORY_SIZE; + + public AbstractGeometryType(Normalizer normalizer) { + super(normalizer); + } + + @Override + public String incrementIndex(String index) { + return ((DiscreteIndexNormalizer) normalizer).incrementIndex(index); + } + + @Override + public String decrementIndex(String index) { + return ((DiscreteIndexNormalizer) normalizer).decrementIndex(index); + } + + @Override + @SuppressWarnings("unchecked") + public List discretizeRange(String beginIndex, String endIndex) { + return ((DiscreteIndexNormalizer) normalizer).discretizeRange(beginIndex, endIndex); + } + + @Override + public boolean producesFixedLengthRanges() { + return ((DiscreteIndexNormalizer) normalizer).producesFixedLengthRanges(); + } + + @Override + public long sizeInBytes() { + long size = STATIC_SIZE + (2 * normalizedValue.length()); + + if (this instanceof OneToManyNormalizerType) { + List values = ((OneToManyNormalizerType) this).getNormalizedValues(); + size += 2 * values.stream().map(String::length).map(x -> x + Sizer.REFERENCE).reduce(Integer::sum).orElse(0); + } + + List leafGeometries = new ArrayList<>(); + LinkedList workingList = new LinkedList<>(); + workingList.push(delegate.getJTSGeometry()); + + while (!workingList.isEmpty()) { + Geometry geom = workingList.pop(); + + if (geom.getNumGeometries() > 1) { + size += Sizer.OBJECT_OVERHEAD; + + // push all the geometries to the working list + for (int i = 0; i < geom.getNumGeometries(); i++) { + workingList.push(geom.getGeometryN(i)); + } + } else if (geom instanceof Polygon) { + size += 2 * Sizer.OBJECT_OVERHEAD 
+ GEOMETRY_BASE_SIZE; + + Polygon poly = (Polygon) geom; + + // push all the exterior and interior rings to the working list + workingList.push(poly.getExteriorRing()); + for (int i = 0; i < poly.getNumInteriorRing(); i++) { + workingList.push(poly.getInteriorRingN(i)); + } + + } else { + size += 3 * Sizer.OBJECT_OVERHEAD + GEOMETRY_BASE_SIZE; + leafGeometries.add(geom); + } + } + + for (Geometry geom : leafGeometries) { + size += Sizer.ARRAY_OVERHEAD + Sizer.OBJECT_OVERHEAD + geom.getCoordinates().length * (3 * 8 + Sizer.OBJECT_OVERHEAD + Sizer.REFERENCE) + + Sizer.REFERENCE; + } + return size; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/BaseType.java b/core/utils/type-utils/src/main/java/datawave/data/type/BaseType.java new file mode 100644 index 00000000000..9ae9b5c1a55 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/BaseType.java @@ -0,0 +1,175 @@ +package datawave.data.type; + +import java.io.Serializable; +import java.util.Collection; +import java.util.List; + +import datawave.data.normalizer.Normalizer; +import datawave.webservice.query.data.ObjectSizeOf; + +public class BaseType & Serializable> implements Serializable, Type, ObjectSizeOf { + + private static final long serialVersionUID = 5354270429891763693L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF + Sizer.REFERENCE + Sizer.REFERENCE; + + protected T delegate; + protected String normalizedValue; + protected final Normalizer normalizer; + + public BaseType(String delegateString, Normalizer normalizer) { + this.normalizer = normalizer; + setDelegate(normalizer.denormalize(delegateString)); + } + + public BaseType(Normalizer normalizer) { + this.normalizer = normalizer; + } + + public T getDelegate() { + return delegate; + } + + public void setDelegateFromString(String in) { + setDelegate(normalizer.denormalize(in)); + } + + public void setDelegate(T delegate) { + this.delegate = delegate; + 
normalizeAndSetNormalizedValue(this.delegate); + } + + public String getNormalizedValue() { + return normalizedValue; + } + + @Override + public T denormalize() { + return this.delegate; + } + + public void setNormalizedValue(String normalizedValue) { + this.normalizedValue = normalizedValue; + } + + public int compareTo(Type o) { + return this.getDelegate().compareTo(o.getDelegate()); + } + + public String normalize() { + return normalizer.normalizeDelegateType(this.delegate); + } + + public String normalize(String in) { + return normalizer.normalize(in); + } + + public Collection expand(String in) { + return normalizer.expand(in); + } + + public Collection expand() { + return normalizer.expand(this.delegate.toString()); + } + + public T denormalize(String in) { + return normalizer.denormalize(in); + } + + @Override + public String normalizeRegex(String in) { + return normalizer.normalizeRegex(in); + } + + @Override + public boolean normalizedRegexIsLossy(String in) { + return normalizer.normalizedRegexIsLossy(in); + } + + @Override + public void normalizeAndSetNormalizedValue(T valueToNormalize) { + setNormalizedValue(normalizer.normalizeDelegateType(valueToNormalize)); + } + + public void validate() { + if (this.delegate == null || this.normalizedValue == null) + throw new IllegalArgumentException(this + " does not validate: " + delegate + "," + normalizedValue); + } + + private int delegateHashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((delegate == null) ? 
0 : delegate.hashCode()); + return result; + } + + private boolean delegateEquals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + @SuppressWarnings("unchecked") + BaseType other = (BaseType) obj; + if (delegate == null) { + if (other.delegate != null) + return false; + } else if (!delegate.equals(other.delegate)) + return false; + return true; + } + + @Override + public int hashCode() { + if (delegate == null) { + // Use the concrete Type's full name to ensure that we don't get multiple + // instances of the same class (as Object#hashCode is based on virtual memory location) + return this.getClass().getName().hashCode(); + } else { + return delegateHashCode(); + } + } + + /** + * Compares this type to the given object. When this type has no delegate, any non-null object of exactly the same class is considered equal; otherwise + * the objects must be of the same class and have equal delegates. + * + * @param o + * the object to compare against, possibly null + * @return true if the objects are considered equal, or false otherwise + */ + @Override + public boolean equals(Object o) { + if (delegate == null) { + // Object#equals must return false for null rather than throw a NullPointerException. + if (o == null) { + return false; + } + Class otherClz = o.getClass(); + + // Since Types are considered to be stateless, + // we can treat equality as the same class + // NOTE(review): an instance without a delegate reports equality with a same-class instance that has one, but not the reverse. Confirm the two + // are never mixed. + if (otherClz.equals(this.getClass())) { + return true; + } + return false; + } else { + return delegateEquals(o); + } + } + + @Override + public String getDelegateAsString() { + return toString(); + } + + @Override + public String toString() { + return delegate == null ?
super.toString() : delegate.toString(); + } + + /** + * Computes a size for this instance from: the static overhead ({@code STATIC_SIZE}), two bytes per character of the normalized value, and the size of the + * delegate as reported by {@code ObjectSizeOf.Sizer.getObjectSize}. For a {@link OneToManyNormalizerType}, two bytes per character plus one reference + * are added for each normalized value. Normalizers are not counted because they are singletons. + * + * NOTE(review): this dereferences normalizedValue, so it presumably must only be called after a delegate or normalized value has been set. TODO confirm. + * + * @return the computed size in bytes + */ + @Override + public long sizeInBytes() { + long size = 0; + if (this instanceof OneToManyNormalizerType) { + List values = ((OneToManyNormalizerType) this).getNormalizedValues(); + size += values.stream().map(String::length).map(length -> 2 * length + ObjectSizeOf.Sizer.REFERENCE).reduce(Integer::sum).orElse(0); + } + size += STATIC_SIZE + (2 * normalizedValue.length()) + ObjectSizeOf.Sizer.getObjectSize(delegate); + return size; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/DateType.java b/core/utils/type-utils/src/main/java/datawave/data/type/DateType.java new file mode 100644 index 00000000000..2659ef0a865 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/DateType.java @@ -0,0 +1,36 @@ +package datawave.data.type; + +import java.util.Date; + +import datawave.data.normalizer.Normalizer; + +public class DateType extends BaseType { + + private static final long serialVersionUID = 936566410691643144L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF + PrecomputedSizes.DATE_STATIC_REF + Sizer.REFERENCE; + + public DateType() { + super(Normalizer.DATE_NORMALIZER); + } + + public DateType(String dateString) { + super(Normalizer.DATE_NORMALIZER); + super.setDelegate(normalizer.denormalize(dateString)); + } + + @Override + public String getDelegateAsString() { + // the normalized form of the date preserves milliseconds + return normalizer.normalizeDelegateType(getDelegate()); + } + + /** + * One string, one date object, one reference to the normalizer + * + * @return the static overhead plus two bytes per character of the normalized value + */ + @Override + public long sizeInBytes() { + return STATIC_SIZE + (2 * normalizedValue.length()); + } +} diff --git
a/core/utils/type-utils/src/main/java/datawave/data/type/DiscreteIndexType.java b/core/utils/type-utils/src/main/java/datawave/data/type/DiscreteIndexType.java new file mode 100644 index 00000000000..8352ad14dd1 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/DiscreteIndexType.java @@ -0,0 +1,54 @@ +package datawave.data.type; + +import java.util.List; + +/** + * Contains a collection of useful methods which can be used against index entries which are discrete and calculable. + * + * @param + */ +public interface DiscreteIndexType> extends Type { + + /** + * Increments the given index to the next logical value. + * + * If producesFixedLengthRanges is true, and incrementIndex would cause the length of the index to change, the original index will be returned. + * + * @param index + * @return an incremented index + */ + String incrementIndex(String index); + + /** + * Decrements the given index to the previous logical value. + * + * If producesFixedLengthRanges is true, and decrementIndex would cause the length of the index to change, the original index will be returned. + * + * @param index + * @return a decremented index + */ + String decrementIndex(String index); + + /** + * Returns a list of all discrete values between begin and end. + * + * If producesFixedLengthRanges is true, the returned values will be of the same length as begin and end. + * + * If producesFixedLengthRanges is true, and begin and end are of different lengths, the original range will be returned. + * + * If begin does not come before end, an empty list will be returned. + * + * @param beginIndex + * @param endIndex + * @return a list of the discrete index values between begin and end + */ + List discretizeRange(String beginIndex, String endIndex); + + /** + * Indicates whether or not the ranges against the given indices will be of fixed length. That is to say, whether or not all index values within a given + * range will have the same string length. 
This is an important characteristic which enables composite ranges to be created. + * + * @return whether query ranges against these values will be of fixed length + */ + boolean producesFixedLengthRanges(); +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/GeoLatType.java b/core/utils/type-utils/src/main/java/datawave/data/type/GeoLatType.java new file mode 100644 index 00000000000..e292c000f8c --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/GeoLatType.java @@ -0,0 +1,23 @@ +package datawave.data.type; + +import datawave.data.normalizer.Normalizer; + +public class GeoLatType extends BaseType { + + private static final long serialVersionUID = -2775239290833908032L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF * 2 + Sizer.REFERENCE; + + public GeoLatType() { + super(Normalizer.GEO_LAT_NORMALIZER); + } + + /** + * Two String + normalizer reference + * + * @return + */ + @Override + public long sizeInBytes() { + return STATIC_SIZE + (2 * normalizedValue.length()) + (2 * delegate.length()); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/GeoLonType.java b/core/utils/type-utils/src/main/java/datawave/data/type/GeoLonType.java new file mode 100644 index 00000000000..2d34ff553f1 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/GeoLonType.java @@ -0,0 +1,23 @@ +package datawave.data.type; + +import datawave.data.normalizer.Normalizer; + +public class GeoLonType extends BaseType { + + private static final long serialVersionUID = 8912983433360105604L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF * 2 + Sizer.REFERENCE; + + public GeoLonType() { + super(Normalizer.GEO_LON_NORMALIZER); + } + + /** + * Two String + normalizer reference + * + * @return + */ + @Override + public long sizeInBytes() { + return STATIC_SIZE + (2 * normalizedValue.length()) + (2 * delegate.length()); + } +} diff --git 
a/core/utils/type-utils/src/main/java/datawave/data/type/GeoType.java b/core/utils/type-utils/src/main/java/datawave/data/type/GeoType.java new file mode 100644 index 00000000000..85d1419603c --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/GeoType.java @@ -0,0 +1,23 @@ +package datawave.data.type; + +import datawave.data.normalizer.Normalizer; + +public class GeoType extends BaseType { + + private static final long serialVersionUID = 8429780512238258642L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF * 2 + Sizer.REFERENCE; + + public GeoType() { + super(Normalizer.GEO_NORMALIZER); + } + + /** + * Two String + normalizer reference + * + * @return + */ + @Override + public long sizeInBytes() { + return STATIC_SIZE + (2 * normalizedValue.length()) + (2 * delegate.length()); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/GeometryType.java b/core/utils/type-utils/src/main/java/datawave/data/type/GeometryType.java new file mode 100644 index 00000000000..23e6e69ee81 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/GeometryType.java @@ -0,0 +1,43 @@ +package datawave.data.type; + +import java.util.List; + +import datawave.data.normalizer.Normalizer; +import datawave.data.normalizer.OneToManyNormalizer; +import datawave.data.type.util.Geometry; + +/** + * Provides inclusive support for all geometry types. OneToManyNormalizer support is needed as lines and polygons are likely to produce multiple normalized + * values during ingest. 
+ */ +public class GeometryType extends AbstractGeometryType implements OneToManyNormalizerType { + + protected List normalizedValues; + + public GeometryType() { + super(Normalizer.GEOMETRY_NORMALIZER); + } + + public List normalizeToMany(String in) { + return ((OneToManyNormalizer) normalizer).normalizeToMany(in); + } + + public void setNormalizedValues(List normalizedValues) { + this.normalizedValues = normalizedValues; + setNormalizedValue(this.normalizedValues.toString()); + } + + @Override + public void normalizeAndSetNormalizedValue(Geometry valueToNormalize) { + setNormalizedValues(((OneToManyNormalizer) normalizer).normalizeDelegateTypeToMany(valueToNormalize)); + } + + public List getNormalizedValues() { + return normalizedValues; + } + + @Override + public boolean expandAtQueryTime() { + return false; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/HexStringType.java b/core/utils/type-utils/src/main/java/datawave/data/type/HexStringType.java new file mode 100644 index 00000000000..6528a5ba8a6 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/HexStringType.java @@ -0,0 +1,23 @@ +package datawave.data.type; + +import datawave.data.normalizer.Normalizer; + +public class HexStringType extends BaseType { + + private static final long serialVersionUID = -3480716807342380164L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF * 2 + Sizer.REFERENCE; + + public HexStringType() { + super(Normalizer.HEX_STRING_NORMALIZER); + } + + /** + * Two String + normalizer reference + * + * @return + */ + @Override + public long sizeInBytes() { + return STATIC_SIZE + (2 * normalizedValue.length()) + (2 * delegate.length()); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/HitTermType.java b/core/utils/type-utils/src/main/java/datawave/data/type/HitTermType.java new file mode 100644 index 00000000000..f2bd7ba31a5 --- /dev/null +++ 
b/core/utils/type-utils/src/main/java/datawave/data/type/HitTermType.java @@ -0,0 +1,3 @@ +package datawave.data.type; + +public class HitTermType extends StringType {} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/IpAddressType.java b/core/utils/type-utils/src/main/java/datawave/data/type/IpAddressType.java new file mode 100644 index 00000000000..b0337b3d82d --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/IpAddressType.java @@ -0,0 +1,45 @@ +package datawave.data.type; + +import datawave.data.normalizer.IpAddressNormalizer; +import datawave.data.normalizer.Normalizer; +import datawave.data.type.util.IpAddress; +import datawave.data.type.util.IpV4Address; +import datawave.data.type.util.IpV6Address; + +public class IpAddressType extends BaseType { + + private static final long serialVersionUID = -6512690642978201801L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF + Sizer.REFERENCE; + + public IpAddressType() { + super(Normalizer.IP_ADDRESS_NORMALIZER); + } + + public IpAddressType(String delegateString) { + super(delegateString, Normalizer.IP_ADDRESS_NORMALIZER); + } + + public String[] normalizeCidrToRange(String cidr) { + return ((IpAddressNormalizer) normalizer).normalizeCidrToRange(cidr); + } + + /** + * calculate the size based on the type of ip address type this is. 
Do not include the normalizer except a reference + * + * @return + */ + @Override + public long sizeInBytes() { + long base = STATIC_SIZE + (2 * normalizedValue.length()); + long ipSize; + if (delegate instanceof IpV4Address) { + ipSize = PrecomputedSizes.IPV4ADDRESS_STATIC_REF; + } else if (delegate instanceof IpV6Address) { + ipSize = PrecomputedSizes.IPV6ADDRESS_STATIC_REF; + } else { + // let the sizer figure it out + ipSize = Sizer.getObjectSize(delegate) + Sizer.REFERENCE; + } + return base + ipSize; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/IpV4AddressType.java b/core/utils/type-utils/src/main/java/datawave/data/type/IpV4AddressType.java new file mode 100644 index 00000000000..199e560ab1b --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/IpV4AddressType.java @@ -0,0 +1,36 @@ +package datawave.data.type; + +import datawave.data.normalizer.Normalizer; +import datawave.data.type.util.IpAddress; +import datawave.data.type.util.IpV4Address; +import datawave.data.type.util.IpV6Address; + +public class IpV4AddressType extends BaseType { + + private static final long serialVersionUID = 7214683578627273557L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF + Sizer.REFERENCE; + + public IpV4AddressType() { + super(Normalizer.IP_ADDRESS_NORMALIZER); + } + + /** + * one String + either IpV4Address or IpV6Address + reference + * + * @return + */ + @Override + public long sizeInBytes() { + long base = STATIC_SIZE + (2 * normalizedValue.length()); + long ipSize; + if (delegate instanceof IpV4Address) { + ipSize = PrecomputedSizes.IPV4ADDRESS_STATIC_REF; + } else if (delegate instanceof IpV6Address) { + ipSize = PrecomputedSizes.IPV6ADDRESS_STATIC_REF; + } else { + // let the sizer figure it out + ipSize = Sizer.getObjectSize(delegate) + Sizer.REFERENCE; + } + return base + ipSize; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/LcNoDiacriticsListType.java 
b/core/utils/type-utils/src/main/java/datawave/data/type/LcNoDiacriticsListType.java new file mode 100644 index 00000000000..d9f33b2f426 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/LcNoDiacriticsListType.java @@ -0,0 +1,15 @@ +package datawave.data.type; + +import datawave.data.normalizer.Normalizer; + +public class LcNoDiacriticsListType extends ListType { + + public LcNoDiacriticsListType() { + super(Normalizer.LC_NO_DIACRITICS_NORMALIZER); + } + + public LcNoDiacriticsListType(String delegateString) { + super(delegateString, Normalizer.LC_NO_DIACRITICS_NORMALIZER); + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/LcNoDiacriticsType.java b/core/utils/type-utils/src/main/java/datawave/data/type/LcNoDiacriticsType.java new file mode 100644 index 00000000000..5f60475f5d4 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/LcNoDiacriticsType.java @@ -0,0 +1,27 @@ +package datawave.data.type; + +import datawave.data.normalizer.Normalizer; + +public class LcNoDiacriticsType extends BaseType { + + private static final long serialVersionUID = -6219894926244790742L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF * 2 + Sizer.REFERENCE; + + public LcNoDiacriticsType() { + super(Normalizer.LC_NO_DIACRITICS_NORMALIZER); + } + + public LcNoDiacriticsType(String delegateString) { + super(delegateString, Normalizer.LC_NO_DIACRITICS_NORMALIZER); + } + + /** + * Two strings + normalizer reference + * + * @return + */ + @Override + public long sizeInBytes() { + return STATIC_SIZE + (2 * normalizedValue.length()) + (2 * delegate.length()); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/LcType.java b/core/utils/type-utils/src/main/java/datawave/data/type/LcType.java new file mode 100644 index 00000000000..7b2d3e456db --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/LcType.java @@ -0,0 +1,27 @@ +package 
datawave.data.type; + +import datawave.data.normalizer.Normalizer; + +public class LcType extends BaseType { + + private static final long serialVersionUID = -5102714749195917406L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF * 2 + Sizer.REFERENCE; + + public LcType() { + super(Normalizer.LC_NORMALIZER); + } + + public LcType(String delegateString) { + super(delegateString, Normalizer.LC_NORMALIZER); + } + + /** + * Two String + normalizer reference + * + * @return + */ + @Override + public long sizeInBytes() { + return STATIC_SIZE + (2 * normalizedValue.length()) + (2 * delegate.length()); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/ListType.java b/core/utils/type-utils/src/main/java/datawave/data/type/ListType.java new file mode 100644 index 00000000000..e9a9a06d232 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/ListType.java @@ -0,0 +1,51 @@ +package datawave.data.type; + +import java.util.ArrayList; +import java.util.List; + +import datawave.data.normalizer.Normalizer; +import datawave.util.StringUtils; + +public abstract class ListType extends BaseType implements OneToManyNormalizerType { + protected static final String delimiter = ",|;"; + List normalizedValues; + + public ListType(Normalizer normalizer) { + super(normalizer); + } + + public ListType(String delegateString, Normalizer normalizer) { + super(delegateString, normalizer); + } + + @Override + public List normalizeToMany(String in) { + String[] splits = StringUtils.split(in, delimiter); + List strings = new ArrayList(splits.length); + for (String s : splits) { + + String str = normalizer.normalize(s); + strings.add(str); + + } + + return strings; + } + + @Override + public void setDelegateFromString(String in) { + this.normalizedValues = normalizeToMany(in); + this.delegate = in; + setNormalizedValue(in); + } + + @Override + public List getNormalizedValues() { + return normalizedValues; + } + + @Override + 
public boolean expandAtQueryTime() { + return false; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/MacAddressType.java b/core/utils/type-utils/src/main/java/datawave/data/type/MacAddressType.java new file mode 100644 index 00000000000..5095b8bd31f --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/MacAddressType.java @@ -0,0 +1,23 @@ +package datawave.data.type; + +import datawave.data.normalizer.Normalizer; + +public class MacAddressType extends BaseType { + + private static final long serialVersionUID = -6743560287574389073L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF * 2 + Sizer.REFERENCE; + + public MacAddressType() { + super(Normalizer.MAC_ADDRESS_NORMALIZER); + } + + /** + * Two String + normalizer reference + * + * @return + */ + @Override + public long sizeInBytes() { + return STATIC_SIZE + (2 * normalizedValue.length()) + (2 * delegate.length()); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/NoOpType.java b/core/utils/type-utils/src/main/java/datawave/data/type/NoOpType.java new file mode 100644 index 00000000000..b24faa857f0 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/NoOpType.java @@ -0,0 +1,29 @@ +package datawave.data.type; + +import datawave.data.normalizer.Normalizer; + +public class NoOpType extends BaseType { + + private static final long serialVersionUID = 5316252096230974722L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF * 2 + Sizer.REFERENCE; + + public NoOpType() { + super(Normalizer.NOOP_NORMALIZER); + } + + public NoOpType(String value) { + this(); + this.setDelegate(value); + super.setNormalizedValue(normalizer.normalize(value)); + } + + /** + * two identical strings + normalizer reference + * + * @return + */ + @Override + public long sizeInBytes() { + return STATIC_SIZE + (4 * normalizedValue.length()); + } +} diff --git 
a/core/utils/type-utils/src/main/java/datawave/data/type/NumberListType.java b/core/utils/type-utils/src/main/java/datawave/data/type/NumberListType.java new file mode 100644 index 00000000000..1331d95a3a0 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/NumberListType.java @@ -0,0 +1,15 @@ +package datawave.data.type; + +import datawave.data.normalizer.Normalizer; + +public class NumberListType extends ListType { + + public NumberListType() { + super(Normalizer.NUMBER_NORMALIZER); + } + + @Override + public boolean expandAtQueryTime() { + return true; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/NumberType.java b/core/utils/type-utils/src/main/java/datawave/data/type/NumberType.java new file mode 100644 index 00000000000..8fdf359a2e7 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/NumberType.java @@ -0,0 +1,27 @@ +package datawave.data.type; + +import java.math.BigDecimal; + +import datawave.data.normalizer.Normalizer; + +public class NumberType extends BaseType { + + private static final long serialVersionUID = 1398451215614987988L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF + PrecomputedSizes.BIGDECIMAL_STATIC_REF + Sizer.REFERENCE; + + public NumberType() { + super(Normalizer.NUMBER_NORMALIZER); + } + + public NumberType(String delegateString) { + super(delegateString, Normalizer.NUMBER_NORMALIZER); + } + + /** + * one String, one BigDecimal and one reference to a normalizer + */ + @Override + public long sizeInBytes() { + return STATIC_SIZE + (2 * normalizedValue.length()); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/OneToManyNormalizerType.java b/core/utils/type-utils/src/main/java/datawave/data/type/OneToManyNormalizerType.java new file mode 100644 index 00000000000..6ea93b962b2 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/OneToManyNormalizerType.java @@ -0,0 +1,12 @@ +package 
datawave.data.type; + +import java.util.List; + +public interface OneToManyNormalizerType> extends Type { + + List normalizeToMany(String in); + + List getNormalizedValues(); + + boolean expandAtQueryTime(); +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/PointType.java b/core/utils/type-utils/src/main/java/datawave/data/type/PointType.java new file mode 100644 index 00000000000..36a3d01c1f6 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/PointType.java @@ -0,0 +1,14 @@ +package datawave.data.type; + +import datawave.data.normalizer.Normalizer; +import datawave.data.type.util.Point; + +/** + * Provides support for point geometry types. Other geometry types are not compatible with this type. + */ +public class PointType extends AbstractGeometryType { + + public PointType() { + super(Normalizer.POINT_NORMALIZER); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/RawDateType.java b/core/utils/type-utils/src/main/java/datawave/data/type/RawDateType.java new file mode 100644 index 00000000000..e1b894686ea --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/RawDateType.java @@ -0,0 +1,28 @@ +package datawave.data.type; + +import datawave.data.normalizer.Normalizer; + +public class RawDateType extends BaseType { + + private static final long serialVersionUID = 936566410691643144L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF * 2 + Sizer.REFERENCE; + + public RawDateType() { + super(Normalizer.RAW_DATE_NORMALIZER); + } + + public RawDateType(String dateString) { + super(Normalizer.RAW_DATE_NORMALIZER); + super.setDelegate(normalizer.denormalize(dateString)); + } + + /** + * Two String + normalizer reference + * + * @return + */ + @Override + public long sizeInBytes() { + return STATIC_SIZE + (2 * normalizedValue.length()) + (2 * delegate.length()); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/StringType.java 
b/core/utils/type-utils/src/main/java/datawave/data/type/StringType.java new file mode 100644 index 00000000000..fee9a9039ae --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/StringType.java @@ -0,0 +1,23 @@ +package datawave.data.type; + +import datawave.data.normalizer.Normalizer; + +public class StringType extends BaseType { + + private static final long serialVersionUID = 8143572646109171126L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF * 2 + Sizer.REFERENCE; + + public StringType() { + super(Normalizer.LC_NO_DIACRITICS_NORMALIZER); + } + + /** + * Two String + normalizer reference + * + * @return + */ + @Override + public long sizeInBytes() { + return STATIC_SIZE + (2 * normalizedValue.length()) + (2 * delegate.length()); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/TrimLeadingZerosType.java b/core/utils/type-utils/src/main/java/datawave/data/type/TrimLeadingZerosType.java new file mode 100644 index 00000000000..3237bb1e4e2 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/TrimLeadingZerosType.java @@ -0,0 +1,23 @@ +package datawave.data.type; + +import datawave.data.normalizer.Normalizer; + +public class TrimLeadingZerosType extends BaseType { + + private static final long serialVersionUID = -7425014359719165469L; + private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF * 2 + Sizer.REFERENCE; + + public TrimLeadingZerosType() { + super(Normalizer.TRIM_LEADING_ZEROS_NORMALIZER); + } + + /** + * Two String + normalizer reference + * + * @return + */ + @Override + public long sizeInBytes() { + return STATIC_SIZE + (2 * normalizedValue.length()) + (2 * delegate.length()); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/Type.java b/core/utils/type-utils/src/main/java/datawave/data/type/Type.java new file mode 100644 index 00000000000..9025a38c1f6 --- /dev/null +++ 
b/core/utils/type-utils/src/main/java/datawave/data/type/Type.java @@ -0,0 +1,56 @@ +package datawave.data.type; + +import java.util.Collection; + +public interface Type> extends Comparable> { + + String normalize(); + + T denormalize(); + + String normalize(String in); + + String normalizeRegex(String in); + + boolean normalizedRegexIsLossy(String in); + + Collection expand(String in); + + Collection expand(); + + T denormalize(String in); + + void setDelegate(T delegate); + + /** + * The string form must preserve all information in the delegate such that setDelegateFromString will recreate this instance correctly. + */ + String getDelegateAsString(); + + void setDelegateFromString(String str); + + T getDelegate(); + + void setNormalizedValue(String normalizedValue); + + String getNormalizedValue(); + + void normalizeAndSetNormalizedValue(T valueToNormalize); + + void validate(); + + class Factory { + + private Factory() { + // private constructor to enforce static access + } + + public static Type createType(String datawaveTypeClassName) { + try { + return (Type) Class.forName(datawaveTypeClassName).getDeclaredConstructor().newInstance(); + } catch (Exception e) { + throw new IllegalArgumentException("Error creating instance of class " + datawaveTypeClassName + ':' + e.getLocalizedMessage(), e); + } + } + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/TypeFactory.java b/core/utils/type-utils/src/main/java/datawave/data/type/TypeFactory.java new file mode 100644 index 00000000000..82e8c74dd80 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/TypeFactory.java @@ -0,0 +1,72 @@ +package datawave.data.type; + +import java.util.concurrent.TimeUnit; + +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; + +/** + * TypeFactory that uses an internal loading cache to limit new Type objects + */ +public class TypeFactory { + + private static 
final int DEFAULT_SIZE = 32; + private static final int DEFAULT_TIMEOUT_MINUTES = 15; + + private final LoadingCache> typeCache; + + /** + * Constructor that uses the default size and timeout + */ + public TypeFactory() { + this(DEFAULT_SIZE, DEFAULT_TIMEOUT_MINUTES); + } + + /** + * Constructor that uses custom size and timeout arguments + * + * @param size + * the cache size + * @param timeout + * the timeout in minutes + */ + public TypeFactory(int size, int timeout) { + // @formatter:off + typeCache = CacheBuilder.newBuilder() + .maximumSize(size) + .expireAfterWrite(timeout, TimeUnit.MINUTES) + .build(new CacheLoader<>() { + @Override + public Type load(String className) throws Exception { + Class clazz = Class.forName(className); + return (Type) clazz.getDeclaredConstructor().newInstance(); + } + }); + // @formatter:on + } + + /** + * Create a {@link Type} for the given class name + * + * @param className + * the class name + * @return the Type + */ + public Type createType(String className) { + try { + return typeCache.get(className); + } catch (Exception e) { + throw new IllegalStateException("Error creating instance of class " + className); + } + } + + /** + * Expose current cache size + * + * @return the current cache size + */ + public long getCacheSize() { + return typeCache.size(); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/util/AbstractGeometry.java b/core/utils/type-utils/src/main/java/datawave/data/type/util/AbstractGeometry.java new file mode 100644 index 00000000000..625cfc02465 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/util/AbstractGeometry.java @@ -0,0 +1,25 @@ +package datawave.data.type.util; + +import java.io.Serializable; + +/** + * The base GeoWave geometry delegate object, which wraps the underlying JTS geometry + * + * @param + * The underlying JTS Geometry + */ +public abstract class AbstractGeometry implements Serializable { + protected final T jtsGeom; + + public 
AbstractGeometry(T jtsGeom) { + this.jtsGeom = jtsGeom; + } + + public T getJTSGeometry() { + return jtsGeom; + } + + public String toString() { + return jtsGeom.toText(); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/util/Geometry.java b/core/utils/type-utils/src/main/java/datawave/data/type/util/Geometry.java new file mode 100644 index 00000000000..5b0ea6cb1be --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/util/Geometry.java @@ -0,0 +1,17 @@ +package datawave.data.type.util; + +import java.io.Serializable; + +/** + * This class operates as a delegate for any JTS Geometry instance. + */ +public class Geometry extends AbstractGeometry implements Comparable, Serializable { + public Geometry(org.locationtech.jts.geom.Geometry jtsGeom) { + super(jtsGeom); + } + + @Override + public int compareTo(Geometry o) { + return jtsGeom.compareTo(o.jtsGeom); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/util/IpAddress.java b/core/utils/type-utils/src/main/java/datawave/data/type/util/IpAddress.java new file mode 100644 index 00000000000..4a98b585059 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/util/IpAddress.java @@ -0,0 +1,44 @@ +package datawave.data.type.util; + +import java.io.Serializable; + +/** + * The general IpAddress + * + */ +public abstract class IpAddress implements Serializable, Comparable { + private static final long serialVersionUID = -8461591227664317046L; + + public abstract String toZeroPaddedString(); + + public abstract String toReverseString(); + + public abstract String toReverseZeroPaddedString(); + + public abstract IpAddress getStartIp(int validBits); + + public abstract IpAddress getEndIp(int validBits); + + /** + * Parse an address and return an appropriate representation + * + * @param address + * @return An IpV4 or IpV6 address + */ + public static IpAddress parse(String address) { + try { + return IpV4Address.parse(address); + } 
catch (IllegalArgumentException iae) { + return IpV6Address.parse(address); + } + } + + @Override + public boolean equals(Object o) { + if (o instanceof IpAddress) { + return (compareTo((IpAddress) o) == 0); + } else { + return false; + } + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/util/IpV4Address.java b/core/utils/type-utils/src/main/java/datawave/data/type/util/IpV4Address.java new file mode 100644 index 00000000000..bb151b192de --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/util/IpV4Address.java @@ -0,0 +1,373 @@ +package datawave.data.type.util; + +import com.google.common.base.Objects; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; + +/** + * The IpV4 address + * + */ +public class IpV4Address extends IpAddress { + private static final long serialVersionUID = -3258500702340145500L; + private byte[] ipaddress = new byte[4]; + private int wildcardLoc = -1; + private int numOctets = 4; + + public IpV4Address(byte[] address) { + if (address.length != 4) { + throw new IllegalArgumentException("An IPV4 address must be 4 bytes in length"); + } + System.arraycopy(address, 0, this.ipaddress, 0, 4); + } + + public IpV4Address(byte[] address, int wildcardLoc, int numOctets) { + this.wildcardLoc = wildcardLoc; + this.numOctets = numOctets; + if (address.length != 4) { + throw new IllegalArgumentException("An IPV4 address must be 4 bytes in length"); + } + System.arraycopy(address, 0, this.ipaddress, 0, 4); + } + + public IpV4Address(long ipaddress) { + if ((ipaddress >>> 32) != 0) { + throw new IllegalArgumentException(ipaddress + " is out of range"); + } + this.ipaddress[3] = (byte) (0x00FF & (ipaddress >>> 0)); + this.ipaddress[2] = (byte) (0x00FF & (ipaddress >>> 8)); + this.ipaddress[1] = (byte) (0x00FF & (ipaddress >>> 16)); + this.ipaddress[0] = (byte) (0x00FF & (ipaddress >>> 24)); + } + + /** + * Return the underlying bytes + * + * @return the IpV4 address bytes + */ 
+ public byte[] toBytes() { + return new byte[] {this.ipaddress[0], this.ipaddress[1], this.ipaddress[2], this.ipaddress[3]}; + } + + /** + * Return the underlying bytes in reverse order + * + * @return the IpV4 address bytes in reverse order + */ + public byte[] toReverseBytes() { + return new byte[] {this.ipaddress[3], this.ipaddress[2], this.ipaddress[1], this.ipaddress[0]}; + } + + /** + * Return the int representation of this address + * + * @return an int + */ + public long toNumber() { + long value = 0x00FF & ipaddress[0]; + value <<= 8; + value |= 0x00FF & ipaddress[1]; + value <<= 8; + value |= 0x00FF & ipaddress[2]; + value <<= 8; + value |= 0x00FF & ipaddress[3]; + return value; + } + + /** + * Return the int representation of this address + * + * @return an int + */ + public long toReverseNumber() { + long value = 0x00FF & ipaddress[3]; + value <<= 8; + value |= 0x00FF & ipaddress[2]; + value <<= 8; + value |= 0x00FF & ipaddress[1]; + value <<= 8; + value |= 0x00FF & ipaddress[0]; + return value; + } + + /** + * Parse an address assume the specified radix + * + * @param address + * @param radix + * The radix (e.g. 10 for decimal, 16 for hexidecimal, ...). 
0 means that Number.decode() will be used + * @param dotted + * true if a dot notation, false if simply a number + * @return the IpV4 address + * @throws IllegalArgumentException + * if the radix is not 0, 10, 8, 16, or the address cannot be parsed + * @throws NumberFormatException + * if a number cannot be parsed using the specified radix + */ + public static IpV4Address parse(String address, int radix, boolean dotted) { + if (radix != 0 && radix != 10 && radix != 16 && radix != 8) { + throw new IllegalArgumentException("Radix " + radix + " is not 0, 8, 10, or 16"); + } + if (dotted) { + int wildcard = address.indexOf('*'); + String[] parts = Iterables.toArray(Splitter.on('.').split(address), String.class); + if (parts.length != 4 && wildcard == -1) { + throw new IllegalArgumentException("Expected 4 parts but got " + parts.length + " for " + address); + } else if (wildcard > -1) { + // if 1.1.* need to make it 001.001.000.000 and mark the wildcard location + byte[] ipaddress = new byte[4]; + int wc_octet = 0; + // work backwards + for (int i = 3; i >= 0; i--) { + if (i >= parts.length) { + // we need to pad + ipaddress[i] = (byte) 0; + wc_octet = i; + } else if (parts[i].isEmpty() || parts[i].equals("*")) { + // pad remainder with zeros and mark location + ipaddress[i] = (byte) 0; + wc_octet = i; + } else { + int value = 0; + if (!parts[i].isEmpty()) { + value = (radix == 0 ? 
Integer.decode(parts[i]) : Integer.parseInt(parts[i], radix)); + } + if ((value >>> 8) != 0) { + throw new IllegalArgumentException("Part " + parts[i] + " of " + address + " is out of range in radix " + radix); + } + ipaddress[i] = (byte) value; + } + } + return new IpV4Address(ipaddress, wc_octet, parts.length); + } else { + byte[] ipaddress = new byte[4]; + for (int i = 0; i < 4; i++) { + if ((radix == 0 && parts[i].length() > 4) || (radix == 10 && parts[i].length() > 3) || (radix == 16 && parts[i].length() > 2) + || (radix == 8 && parts[i].length() > 4)) { + throw new IllegalArgumentException("Part " + parts[i] + " of " + address + " is has too many digits for radix " + radix); + } + int value = 0; + if (!parts[i].isEmpty()) { + value = (radix == 0 ? Integer.decode(parts[i]) : Integer.parseInt(parts[i], radix)); + } + if ((value >>> 8) != 0) { + throw new IllegalArgumentException("Part " + parts[i] + " of " + address + " is out of range in radix " + radix); + } + ipaddress[i] = (byte) value; + } + return new IpV4Address(ipaddress); + } + } else { + long ipaddress = (radix == 0 ? Long.decode(address) : Long.parseLong(address, radix)); + if ((ipaddress >>> 32) != 0) { + throw new IllegalArgumentException(address + " is out of range in radix " + radix); + } + return new IpV4Address(ipaddress); + } + } + + /** + * Parse an address assume the specified radix. 
It attempts first as a dotted notation, then as a single number + * + * @param address + * @param radix + * 10 for decimal, 8 for octal, 16 for hexidecimal, 0 to use Number.decode + * @return An IpV4Address + * @throws IllegalArgumentException + * if the radix is not 0, 10, 8, 16, or the address cannot be parsed + * @throws NumberFormatException + * if a number cannot be parsed using the specified radix + */ + public static IpV4Address parse(String address, int radix) { + try { + return IpV4Address.parse(address, radix, true); + } catch (Exception iae) { + return IpV4Address.parse(address, radix, false); + } + } + + /** + * Parse an address. It attempts first as radix 10, then as radix 16, then as radix 8, then as radix 0 + * + * @param address + * @return An IpV4Address + * @throws IllegalArgumentException + * if it cannot be parsed + */ + public static IpV4Address parse(String address, boolean dotted) { + try { + return IpV4Address.parse(address, 10, dotted); + } catch (Exception iae10) { + try { + return IpV4Address.parse(address, 16, dotted); + } catch (Exception iae16) { + try { + return IpV4Address.parse(address, 8, dotted); + } catch (Exception iae8) { + return IpV4Address.parse(address, 0, dotted); + } + } + } + } + + /** + * Parse an address. 
It attempts first as radix 10, then as radix 16, then as radix 8, then as radix 0 + * + * @param address + * @return An IpV4Address + * @throws IllegalArgumentException + * if it cannot be parsed + */ + public static IpV4Address parse(String address) { + try { + return IpV4Address.parse(address, 10); + } catch (Exception iae10) { + try { + return IpV4Address.parse(address, 16); + } catch (Exception iae16) { + try { + return IpV4Address.parse(address, 8); + } catch (Exception iae8) { + return IpV4Address.parse(address, 0); + } + } + } + } + + public static String toString(byte[] address, boolean zeroPadded, int wc_loc, int numOctets, boolean reverse) { + StringBuilder builder = new StringBuilder(15); + for (int i = 0; i < address.length; i++) { + if (wc_loc != -1 && numOctets - 1 < i) { + break; + } + + if (builder.length() > 0) { + builder.append('.'); + } + + if (i == wc_loc) { + builder.append("*"); + if (wc_loc != 0) { + break; + } + } else { + String value = Integer.toString(0x00FF & address[i]); + if (zeroPadded) { + for (int j = value.length(); j < 3; j++) { + builder.append('0'); + } + } + builder.append(value); + } + + } + return builder.toString(); + } + + @Override + public String toString() { + return toString(ipaddress, false, this.wildcardLoc, this.numOctets, false); + } + + @Override + public String toZeroPaddedString() { + return toString(ipaddress, true, this.wildcardLoc, this.numOctets, false); + } + + @Override + public String toReverseString() { + if (wildcardLoc > -1) { + return toString(toReverseBytes(), false, 3 - this.wildcardLoc, this.numOctets, true); + } else { + return toString(toReverseBytes(), false, this.wildcardLoc, this.numOctets, true); + } + } + + @Override + public String toReverseZeroPaddedString() { + if (wildcardLoc > -1) { + return toString(toReverseBytes(), true, 3 - this.wildcardLoc, this.numOctets, true); + } else { + return toString(toReverseBytes(), true, this.wildcardLoc, this.numOctets, true); + } + } + + @Override + 
public IpAddress getStartIp(int validBits) { + byte[] ipaddress = new byte[4]; + for (int i = 0; i < 4; i++) { + if (validBits < 0) { + // Do nothing + } else if (validBits < 8) { + int shift = 8 - validBits; + ipaddress[i] = (byte) (((0x00FF >>> shift) << shift) & this.ipaddress[i]); + } else { + ipaddress[i] = this.ipaddress[i]; + } + validBits -= 8; + } + return new IpV4Address(ipaddress); + } + + @Override + public IpAddress getEndIp(int validBits) { + byte[] ipaddress = new byte[4]; + for (int i = 0; i < 4; i++) { + if (validBits < 0) { + ipaddress[i] = (byte) (0x00FF); + } else if (validBits < 8) { + ipaddress[i] = (byte) ((0x00FF >>> validBits) | this.ipaddress[i]); + } else { + ipaddress[i] = this.ipaddress[i]; + } + validBits -= 8; + } + return new IpV4Address(ipaddress); + } + + @Override + public int compareTo(IpAddress o) { + if (o instanceof IpV4Address) { + long i1 = toNumber(); + long i2 = ((IpV4Address) o).toNumber(); + if (i1 < i2) { + return -1; + } else if (i1 > i2) { + return 1; + } + } else if (o instanceof IpV6Address) { + IpV4Address addr = ((IpV6Address) o).toIpV4Address(); + if (addr == null) { + return -1; + } else { + return compareTo(addr); + } + } + return 0; + } + + @Override + public boolean equals(Object o) { + IpV4Address other = null; + if (o instanceof IpV6Address) { + other = ((IpV6Address) o).toIpV4Address(); + } else if (o instanceof IpV4Address) { + other = (IpV4Address) o; + } + if (null == other) { + return false; + } + + return Objects.equal(this.toNumber(), other.toNumber()); + } + + @Override + public int hashCode() { + int hashCode = 0; + for (int i = 0; i < 4; i++) { + hashCode += ipaddress[i]; + } + return hashCode; + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/util/IpV6Address.java b/core/utils/type-utils/src/main/java/datawave/data/type/util/IpV6Address.java new file mode 100644 index 00000000000..30b0ab75b3c --- /dev/null +++ 
b/core/utils/type-utils/src/main/java/datawave/data/type/util/IpV6Address.java @@ -0,0 +1,262 @@ +package datawave.data.type.util; + +import org.apache.commons.lang3.StringUtils; + +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; + +/** + * The IpV6 address + * + */ +public class IpV6Address extends IpAddress { + private static final long serialVersionUID = -1528748156190096213L; + private short[] ipaddress = new short[8]; + + public IpV6Address(short[] address) { + if (address.length != 8) { + throw new IllegalArgumentException("An IpV6 address must be 8 shorts in length"); + } + System.arraycopy(address, 0, this.ipaddress, 0, address.length); + } + + /** + * Return the underlying short values + * + * @return the IpV6 address short values + */ + public short[] toShorts() { + return new short[] {this.ipaddress[0], this.ipaddress[1], this.ipaddress[2], this.ipaddress[3], this.ipaddress[4], this.ipaddress[5], this.ipaddress[6], + this.ipaddress[7]}; + } + + /** + * Return the underlying short values in reverse order + * + * @return the IpV6 address short values in reverse order + */ + public short[] toReverseShorts() { + return new short[] {this.ipaddress[7], this.ipaddress[6], this.ipaddress[5], this.ipaddress[4], this.ipaddress[3], this.ipaddress[2], this.ipaddress[1], + this.ipaddress[0]}; + } + + /** + * Parse an address assume the specified base + * + * @param address + * @return the IpV6 address + * @throws IllegalArgumentException + * if the base is not 10, 8, 16, or the address cannot be parsed using the specified base or dotted/not + */ + public static IpV6Address parse(String address) { + String[] parts = Iterables.toArray(Splitter.on(':').split(address), String.class); + if (parts.length > 8) { + throw new IllegalArgumentException("Expected no more than 8 parts but got " + parts.length + " for " + address); + } + // if less than 8 parts, then there must be a "::" somewhere in there or an IPv4 address at the end + 
boolean expectFiller = (address.contains("::")); + boolean expectIpv4 = (address.indexOf('.') >= 0); + if (!expectFiller) { + if (expectIpv4 && parts.length != 7) { + throw new IllegalArgumentException("Wrong number of sections in " + address); + } + } else { + if (expectIpv4 && parts.length > 7) { + throw new IllegalArgumentException("Wrong number of sections in " + address); + } + } + + short[] ipaddress = new short[8]; + int index = 0; + for (int i = 0; i < 8; i++) { + if (index >= parts.length) + throw new IllegalArgumentException("Error processing address " + address); + if (i == 6 && expectIpv4) { + byte[] bytes = IpV4Address.parse(parts[index]).toBytes(); + ipaddress[i++] = (short) (((0x00FF & bytes[0]) << 8) | (0x00FF & bytes[1])); + ipaddress[i] = (short) (((0x00FF & bytes[2])) << 8 | (0x00FF & bytes[3])); + } else if (parts[index].isEmpty() && expectFiller) { + i += (8 - parts.length); + if (expectIpv4) { + i--; + } + // can only have one of these + expectFiller = false; + } else { + int value = (!parts[index].isEmpty() ? 
Integer.parseInt(parts[index], 16) : 0); + if ((value >>> 16) != 0) { + throw new IllegalArgumentException("Part " + parts[i] + " of " + address + " is out of range in base 16"); + } + ipaddress[i] = (short) value; + } + index++; + + } + return new IpV6Address(ipaddress); + } + + public static String toString(short[] address, boolean zeroPadded, boolean skipZeros) { + StringBuilder builder = new StringBuilder(39); + int startSkip = -1; + int length = 0; + if (skipZeros) { + // find the longest sequence of zeros + int count = 0; + for (int i = 0; i < 8; i++) { + if (address[i] == 0) { + count++; + } else { + if (count > length) { + startSkip = i - count; + length = count; + } + count = 0; + } + } + if (count > length) { + startSkip = 8 - count; + length = count; + } + } + for (int i = 0; i < address.length; i++) { + if (i == startSkip) { + builder.append(':'); + i += length; + } + if (builder.length() > 0 && StringUtils.countMatches(builder.toString(), ":") < 7) { + // the countMatches test will prevent adding an extra : at the end and making it look like 9 tokens instead of the allowed max of 8 + builder.append(':'); + } + if (i < address.length) { + String value = Integer.toString(0x00FFFF & address[i], 16); + if (zeroPadded) { + for (int j = value.length(); j < 4; j++) { + builder.append('0'); + } + } + builder.append(value); + } + } + return builder.toString(); + } + + @Override + public String toString() { + return toString(ipaddress, false, true); + } + + @Override + public String toZeroPaddedString() { + return toString(ipaddress, true, false); + } + + @Override + public String toReverseString() { + return toString(toReverseShorts(), false, true); + } + + @Override + public String toReverseZeroPaddedString() { + return toString(toReverseShorts(), true, false); + } + + /** + * Return the IpV4Address representation if only the last 2 shorts are set + * + * @return the IpV4Address representation, null if not compatible with IpV4 + */ + public IpV4Address 
toIpV4Address() { + if (ipaddress[0] != 0 || ipaddress[1] != 0 || ipaddress[2] != 0 || ipaddress[3] != 0 || ipaddress[4] != 0 || ipaddress[5] != 0) { + return null; + } else { + return new IpV4Address(((0x00FFFFl & ipaddress[6]) << 16) | (0x00FFFFl & ipaddress[7])); + } + } + + @Override + public IpAddress getStartIp(int validBits) { + short[] ipaddress = new short[8]; + for (int i = 0; i < 8; i++) { + if (validBits < 0) { + // Do nothing + } else if (validBits < 16) { + int shift = 16 - validBits; + ipaddress[i] = (short) (((0x00FFFF >>> shift) << shift) & this.ipaddress[i]); + } else { + ipaddress[i] = this.ipaddress[i]; + } + validBits -= 16; + } + return new IpV6Address(ipaddress); + } + + @Override + public IpAddress getEndIp(int validBits) { + short[] ipaddress = new short[8]; + for (int i = 0; i < 8; i++) { + if (validBits < 0) { + ipaddress[i] = (short) (0x00FFFF); + } else if (validBits < 16) { + ipaddress[i] = (short) ((0x00FFFF >>> validBits) | this.ipaddress[i]); + } else { + ipaddress[i] = this.ipaddress[i]; + } + validBits -= 16; + } + return new IpV6Address(ipaddress); + } + + @Override + public int compareTo(IpAddress o) { + if (o instanceof IpV6Address) { + IpV6Address other = (IpV6Address) o; + return compareToIpV6Address(other); + } else { + IpV4Address addr = toIpV4Address(); + if (addr == null) { + return 1; + } else { + return addr.compareTo((IpV4Address) o); + } + } + } + + private int compareToIpV6Address(IpV6Address other) { + for (int i = 0; i < 8; i++) { + int comparison = compareSegments(ipaddress[i], other.ipaddress[i]); + if (comparison != 0) { + return comparison; + } + } + return 0; + } + + private int compareSegments(short x, short y) { + return (0x00FFFF & x) - (0x00FFFF & y); + } + + @Override + public boolean equals(Object o) { + if (o instanceof IpV6Address) { + return 0 == compareToIpV6Address(((IpV6Address) o)); + } else if (o instanceof IpV4Address) { + IpV4Address addr = this.toIpV4Address(); + if (addr == null) { + return 
/**
 * Class to hold a MAC Address: the text as supplied, plus the separator and digit-group size that were used to read it.
 */
public class MACAddress implements Serializable, Comparable<MACAddress> {

    private static final long serialVersionUID = 4366259028581959024L;

    /**
     * String representation of the MAC address
     */
    private String macAddress = "";

    /**
     * The separator used between digit groups. Instances created by the parse methods hold a dot separator in its regex-escaped form.
     */
    private String separator = "";

    /**
     * The size of the digit groups.
     */
    private int groupingSize = 0;

    /**
     * MAC addresses contain 12 digits
     */
    private static final int MAC_ADDRESS_LENGTH = 12;

    /**
     * The number of groupings
     */
    private int groupings = 0;

    /** Twelve consecutive hex digits. */
    private static final java.util.regex.Pattern TWELVE_HEX_DIGITS = java.util.regex.Pattern.compile("[0-9a-fA-F]{" + MAC_ADDRESS_LENGTH + "}");

    /** A string made only of hex digits. */
    private static final java.util.regex.Pattern HEX_ONLY = java.util.regex.Pattern.compile("^[0-9a-fA-F]+$");

    /** Hex digit groups, each pair of groups divided by one non-hex character. */
    private static final java.util.regex.Pattern HEX_GROUPS = java.util.regex.Pattern.compile("^([0-9a-fA-F]+[^0-9a-fA-F])+[0-9a-fA-F]+$");

    /** A single non-hex character. */
    private static final java.util.regex.Pattern NON_HEX = java.util.regex.Pattern.compile("[^0-9a-fA-F]");

    /**
     * @param addr
     *            string representation of the MAC address
     * @param sep
     *            separator used in the MAC address
     * @param groupingSize
     *            size of the digit groups
     */
    public MACAddress(String addr, String sep, int groupingSize) {
        this.macAddress = addr;
        this.separator = sep;
        this.groupingSize = groupingSize;
        this.groupings = MAC_ADDRESS_LENGTH / this.groupingSize;
    }

    /**
     * Normalize the string representation of the MAC address. Defaults to using a grouping size of 2
     *
     * @param sep
     *            The separator to use in the normalized string.
     * @return The normalized string
     */
    public String toNormalizedString(String sep) {
        return toNormalizedString(sep, 2);
    }

    /**
     * Normalize the string representation of the MAC address: strip this address's own separator, regroup every run of 12 hex digits using the requested
     * separator and group size, and upper-case the result.
     *
     * @param sep
     *            The separator to use in the normalized string
     * @param groupingSize
     *            The grouping size to use in the normalized string; when it does not divide 12 the last group is shorter
     * @return the normalized string
     * @throws IllegalArgumentException
     *             if groupingSize is less than 1
     */
    public String toNormalizedString(String sep, int groupingSize) {
        Objects.requireNonNull(sep, "sep");
        if (groupingSize < 1) {
            throw new IllegalArgumentException("Grouping size must be at least 1 but was " + groupingSize);
        }

        String digitsOnly = this.macAddress;
        if (!this.separator.isEmpty()) {
            // The stored separator is applied as a regex, so a bare dot must be escaped or it would delete every character.
            digitsOnly = digitsOnly.replaceAll(escapeDot(this.separator), "");
        }

        java.util.regex.Matcher matcher = TWELVE_HEX_DIGITS.matcher(digitsOnly);
        StringBuffer normalized = new StringBuffer(digitsOnly.length() + MAC_ADDRESS_LENGTH);
        while (matcher.find()) {
            // quoteReplacement keeps separators such as "$" or "\" literal.
            matcher.appendReplacement(normalized, java.util.regex.Matcher.quoteReplacement(group(matcher.group(), sep, groupingSize)));
        }
        matcher.appendTail(normalized);

        return normalized.toString().toUpperCase();
    }

    /** Insert {@code sep} after every {@code groupingSize} characters of {@code digits}, except at the end. */
    private static String group(String digits, String sep, int groupingSize) {
        StringBuilder grouped = new StringBuilder(digits.length() * 2);
        for (int i = 0; i < digits.length(); i++) {
            if (i > 0 && i % groupingSize == 0) {
                grouped.append(sep);
            }
            grouped.append(digits.charAt(i));
        }
        return grouped.toString();
    }

    /** Return the regex form of a separator: a lone dot is escaped, anything else is returned unchanged. */
    private static String escapeDot(String sep) {
        return ".".equals(sep) ? "\\." : sep;
    }

    /** Tell whether the value parses as a base-10 long. */
    private static boolean parsesAsDecimal(String value) {
        try {
            Long.parseLong(value, 10);
            return true;
        } catch (NumberFormatException notDecimal) {
            return false;
        }
    }

    /** Return the character at {@code position} as the separator, rejecting addresses too short to have one. */
    private static String separatorAt(String addr, int position) {
        if (position >= addr.length()) {
            throw new IllegalArgumentException("Address " + addr + " is too short to contain a separator at position " + position);
        }
        return String.valueOf(addr.charAt(position));
    }

    /**
     * Attempt to parse a MAC address
     *
     * @param addr
     *            The MAC address
     * @param sep
     *            The string separating hex digits
     * @param groupingSize
     *            The size of the hex digit groups
     * @param strict
     *            If true, an address that is not in groups of 2 or 4 divided by '.', ':' or '-' must contain at least one non-decimal hex digit
     * @return the MACAddress object
     * @throws IllegalArgumentException
     *             if unable to parse out a MAC address
     */
    public static MACAddress parse(String addr, String sep, int groupingSize, boolean strict) {
        if (!addr.contains(sep)) {
            throw new IllegalArgumentException("Address " + addr + " does not contain separator " + sep);
        }
        if (groupingSize < 1 || groupingSize > MAC_ADDRESS_LENGTH) {
            throw new IllegalArgumentException("Grouping size must be between 1 and " + MAC_ADDRESS_LENGTH + ", inclusive.");
        }

        String sepRegex = escapeDot(sep);
        String[] digits = sep.isEmpty() ? new String[] {addr} : addr.split(sepRegex);

        int numberOfGroupings = MAC_ADDRESS_LENGTH / groupingSize;
        if (digits.length != numberOfGroupings) {
            throw new IllegalArgumentException("Address " + addr + " is not " + numberOfGroupings + " groups of digits divided by " + sep);
        }
        for (String digit : digits) {
            if (digit.length() != groupingSize) {
                throw new IllegalArgumentException("Digit block " + digit + " is not " + groupingSize + " digits.");
            }
            // Throws NumberFormatException (an IllegalArgumentException) when the block is not hexadecimal.
            Long.parseLong(digit, 16);
        }

        // If this doesn't look like a standard MAC address, make sure it has hex digits to avoid picking up
        // IPs, etc. The comparison uses the separator as supplied, not its escaped regex form.
        boolean standardLayout = (groupingSize == 2 || groupingSize == 4) && (sep.equals(".") || sep.equals(":") || sep.equals("-"));
        if (strict && !standardLayout) {
            String addrNoSep = sep.isEmpty() ? addr : addr.replaceAll(sepRegex, "");
            if (parsesAsDecimal(addrNoSep)) {
                throw new IllegalArgumentException("This has no hex strings, probably not a mac address");
            }
        }
        return new MACAddress(addr, sepRegex, groupingSize);
    }

    /**
     * Attempt to parse a MAC address. The separator is taken to be the character that follows the first digit group.
     *
     * @param addr
     *            The MAC address
     * @param groupingSize
     *            The size of the hex digit group
     * @return the MAC address object
     * @throws IllegalArgumentException
     *             if unable to parse a MAC address
     */
    public static MACAddress parse(String addr, int groupingSize) {
        if (groupingSize < 1 || groupingSize > MAC_ADDRESS_LENGTH) {
            throw new IllegalArgumentException("Grouping size must be between 1 and " + MAC_ADDRESS_LENGTH + ", inclusive");
        }
        String sep = "";
        if (groupingSize != MAC_ADDRESS_LENGTH) {
            sep = separatorAt(addr, groupingSize);
        }

        return parse(addr, sep, groupingSize, true);
    }

    /**
     * Attempt to parse a MAC address. The grouping size is taken to be the position of the first separator.
     *
     * @param addr
     *            the MAC address
     * @param sep
     *            the separator
     * @return the MAC address object
     * @throws IllegalArgumentException
     *             if unable to parse a MAC address
     */
    public static MACAddress parse(String addr, String sep) {
        if (!addr.contains(sep)) {
            throw new IllegalArgumentException("Separator " + sep + " not found in " + addr);
        }
        int groupingSize = addr.indexOf(sep);
        return parse(addr, sep, groupingSize, true);
    }

    /**
     * Attempt to parse a MAC address. The grouping size is derived from the number of hex digit groups, and the separator is the character that follows the
     * first group.
     *
     * @param addr
     *            the MAC address
     * @return the MAC address object
     * @throws IllegalArgumentException
     *             if unable to parse a MAC address
     */
    public static MACAddress parse(String addr) {
        if (HEX_ONLY.matcher(addr).matches()) {
            return parse(addr, "", MAC_ADDRESS_LENGTH, true);
        } else if (HEX_GROUPS.matcher(addr).matches()) {
            String[] pieces = NON_HEX.split(addr);
            int groupingSize = MAC_ADDRESS_LENGTH / pieces.length;
            String sep = separatorAt(addr, groupingSize);
            return parse(addr, sep, groupingSize, true);
        } else {
            throw new IllegalArgumentException("Unable to find separator in " + addr);
        }
    }

    @Override
    public String toString() {
        return this.macAddress;
    }

    /**
     * Orders by the address text as supplied, so two addresses that are {@link #equals(Object) equal} after normalization may still compare as different.
     */
    @Override
    public int compareTo(MACAddress o) {
        return this.toString().compareTo(o.toString());
    }

    @Override
    public boolean equals(Object o) {
        if (o instanceof MACAddress) {
            // Consider the MAC addresses equal if they have the same normalized string
            return this.toNormalizedString("").equals(((MACAddress) o).toNormalizedString(""));
        } else {
            return false;
        }
    }

    @Override
    public int hashCode() {
        return this.toNormalizedString("").hashCode();
    }
}
/**
 * Provides a one-to-one mapping between an input decimal number and a lexicographically sorted index for that number. The index is composed of two parts,
 * roughly derived from scientific notation: the two digit exponential bin and the mantissa, with 'E' as a separator. Thus, an index takes this format:
 * {@code 'bin'E'mantissa'}.
 * <p>
 * The bins are broken into four groups:
 * <ol>
 * <li>!A through !Z represent negative numbers with magnitude greater than one (exponents 25 through 0, respectively)</li>
 * <li>!a through !z represent negative numbers with magnitude less than 1 (exponents -1 through -26, respectively)</li>
 * <li>+A through +Z represent positive numbers with magnitude less than 1 (exponents -26 through -1, respectively)</li>
 * <li>+a through +z represent positive numbers with magnitude greater than one (exponents 0 through 25, respectively)</li>
 * </ol>
 * For positive numbers, the mantissa exactly matches the mantissa of scientific notation. For negative numbers, the mantissa equals ten minus the mantissa of
 * scientific notation.
 * <p>
 * Some example inputs and encodings:
 * <ul>
 * <li>-12344984165 becomes !PE8.7655015835</li>
 * <li>-500 becomes !XE5</li>
 * <li>-0.501 becomes !aE4.99</li>
 * <li>0 becomes +AE0</li>
 * <li>9E-9 becomes +RE9</li>
 * <li>0.501 becomes +ZE5.01</li>
 * <li>10000 becomes +eE1</li>
 * </ul>
 */
public class NumericalEncoder {

    private static Map<String,String> positiveNumsEncodeToIntExponentsMap;
    private static Map<String,String> positiveNumsIntToEncodeExponentsMap;
    private static Map<String,String> negativeNumEncodeToIntExponentsMap;
    private static Map<String,String> negativeNumIntToEncodeExponentsMap;

    private static final String PLAIN_PATTERN = "0.#########################################################";
    private static final String SCIENTIFIC_PATTERN = PLAIN_PATTERN + "E0";

    // DecimalFormat is not safe for concurrent use, so every thread gets its own instance.
    private static final ThreadLocal<NumberFormat> plainFormatter = ThreadLocal.withInitial(() -> newFormatter(PLAIN_PATTERN));
    private static final ThreadLocal<NumberFormat> scientificFormatter = ThreadLocal.withInitial(() -> newFormatter(SCIENTIFIC_PATTERN));

    private static final String zero = "+AE0";
    private static final List<String> uppercaseLetters = createLetterList('A', 'Z');
    private static final List<String> lowercaseLetters = createLetterList('a', 'z');
    // sign, bin letter, 'E' separator, then a mantissa of one digit with an optional fraction
    private static final String encodedRegex = "(\\!|\\+)[a-zA-Z][Ee][0-9]\\.?[0-9]*";
    private static final Pattern encodedPattern = Pattern.compile(encodedRegex);

    static {
        initNegativeExponents();
        initPositiveExponents();
    }

    /**
     * Create a formatter whose symbols do not depend on the default locale, so the output always uses '.', '-' and 'E' and can be split and re-parsed.
     *
     * @param pattern
     *            the DecimalFormat pattern
     * @return a new formatter
     */
    private static NumberFormat newFormatter(String pattern) {
        return new DecimalFormat(pattern, java.text.DecimalFormatSymbols.getInstance(java.util.Locale.ROOT));
    }

    /**
     * Return an unmodifiable list of letters in order from the given starting letter to the given ending letter.
     *
     * @param start
     *            the starting letter
     * @param end
     *            the ending letter
     * @return a list of letters
     */
    private static List<String> createLetterList(char start, char end) {
        // @formatter:off
        return Collections.unmodifiableList(
                        IntStream.rangeClosed(start, end)
                                        .mapToObj(c -> "" + (char) c)
                                        .collect(Collectors.toList()));
        // @formatter:on
    }

    /**
     * Encode a decimal number as {@code 'bin'E'mantissa'}.
     *
     * @param input
     *            the decimal number as text
     * @return the encoded index
     * @throws IllegalArgumentException
     *             if the input is not a number or its exponent has no bin
     */
    public static String encode(String input) {
        try {
            BigDecimal decimal = new BigDecimal(input);
            int signum = decimal.signum();
            if (signum == 0) {
                return zero;
            }

            String[] decParts = scientificFormatter.get().format(decimal).split("E");
            String mantissa = decParts[0];
            String exp = decParts[1];
            String encodedExponent;
            if (signum > 0) {
                // Positive
                encodedExponent = positiveNumsIntToEncodeExponentsMap.get(exp);
            } else {
                // Negative: the mantissa carries its sign, so adding ten yields ten minus its magnitude
                encodedExponent = negativeNumIntToEncodeExponentsMap.get(exp);
                BigDecimal bigDecMantissa = BigDecimal.TEN.add(new BigDecimal(mantissa));
                mantissa = plainFormatter.get().format(bigDecMantissa);
            }

            if (encodedExponent == null) {
                throw new NumberFormatException("Exponent exceeded allowed range.");
            }

            return encodedExponent + "E" + mantissa;
        } catch (Exception ex) {
            throw new IllegalArgumentException("Error formatting input: " + input + " . Error: " + ex, ex);
        }
    }

    /**
     * This provides a quick test that will determine whether this value is possibly encoded. Provides a mechanism that is significantly faster than waiting for
     * the decode method to throw an exception.
     *
     * @param input
     *            the value to test for encoding
     * @return true if possibly encoded, false if definitely not encoded
     */
    public static boolean isPossiblyEncoded(String input) {
        if (null == input || input.isEmpty())
            return false;

        return encodedPattern.matcher(input).matches();
    }

    /**
     * Decode an index produced by {@link #encode(String)}.
     *
     * @param input
     *            the encoded index
     * @return the decoded number
     * @throws IllegalArgumentException
     *             if the input cannot be decoded
     */
    public static BigDecimal decode(String input) {
        BigDecimal output;
        if (input.equals(zero)) {
            return BigDecimal.ZERO;
        } else {
            try {
                // characters 0-1 are the bin, character 2 is the separator, the rest is the mantissa
                String exp = input.substring(0, 2);
                String mantissa = input.substring(3);
                if (exp.contains("+")) {
                    // Positive Number
                    exp = positiveNumsEncodeToIntExponentsMap.get(exp);
                    output = new BigDecimal(mantissa + "E" + exp);
                } else if (exp.contains("!")) {
                    // Negative Number
                    exp = negativeNumEncodeToIntExponentsMap.get(exp);
                    output = new BigDecimal(mantissa).subtract(BigDecimal.TEN).movePointRight(Integer.parseInt(exp));
                } else {
                    throw new NumberFormatException("Unknown encoded exponent");
                }

            } catch (Exception ex) {
                throw new IllegalArgumentException("Error decoding output: " + input + " . Error: " + ex, ex);
            }
        }
        return output;
    }

    /**
     * @param index
     *            the exponent
     * @return the bin letter used for positive numbers with that exponent
     */
    public static char getPositiveBin(int index) {
        return positiveNumsIntToEncodeExponentsMap.get(String.valueOf(index)).charAt(1);
    }

    /**
     * @param index
     *            the exponent
     * @return the bin letter used for negative numbers with that exponent
     */
    public static char getNegativeBin(int index) {
        return negativeNumIntToEncodeExponentsMap.get(String.valueOf(index)).charAt(1);
    }

    private static void initPositiveExponents() {
        // The order of the encoded characters here maps directly to how their corresponding exponent value is calculated, and must not be changed.
        List<String> exponents = new ArrayList<>();
        uppercaseLetters.stream().map(letter -> "+" + letter).forEach(exponents::add);
        lowercaseLetters.stream().map(letter -> "+" + letter).forEach(exponents::add);
        Map<String,String> map = createExponentMap(exponents);
        positiveNumsEncodeToIntExponentsMap = Collections.unmodifiableMap(map);
        positiveNumsIntToEncodeExponentsMap = Collections.unmodifiableMap(invertMap(map));
    }

    private static void initNegativeExponents() {
        // The order of the encoded characters here maps directly to how their corresponding exponent value is calculated, and must not be changed.
        List<String> exponents = new ArrayList<>();
        // Iterate in reverse.
        ListIterator<String> iterator = lowercaseLetters.listIterator(lowercaseLetters.size());
        while (iterator.hasPrevious()) {
            exponents.add("!" + iterator.previous());
        }
        // Iterate in reverse.
        iterator = uppercaseLetters.listIterator(uppercaseLetters.size());
        while (iterator.hasPrevious()) {
            exponents.add("!" + iterator.previous());
        }
        Map<String,String> map = createExponentMap(exponents);
        negativeNumEncodeToIntExponentsMap = Collections.unmodifiableMap(map);
        negativeNumIntToEncodeExponentsMap = Collections.unmodifiableMap(invertMap(map));
    }

    /** Map each bin to its exponent as text; the bin at list position {@code pos} gets exponent {@code pos - 26}. */
    private static Map<String,String> createExponentMap(List<String> exponents) {
        Map<String,String> map = new HashMap<>();
        for (int pos = 0; pos < exponents.size(); pos++) {
            int exponent = pos - 26;
            map.put(exponents.get(pos), String.valueOf(exponent));
        }
        return map;
    }

    private static Map<String,String> invertMap(Map<String,String> map) {
        return map.entrySet().stream().collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey));
    }
}
java.io.Serializable; + +/** + * This normalizer is aiming to remove non-digits from phone numbers. + */ +public class PhoneNumber implements Serializable, Comparable { + + private String originalPhoneNumber = ""; + private String normalizedPhoneNumber = ""; + + /** + * A valid phone number must contain at least 7 digits. + */ + private static int MIN_LENGTH = 7; + + /** + * Phone numbers cannot have more than 15 digits. + */ + private static int MAX_LENGTH = 15; + + /** + * Valid digits. + */ + private static String DIGITS = "0123456789"; + + /** + * @param number + * the phone number string + */ + public PhoneNumber(String number) { + this(number, number); + } + + /** + * @param number + * the original phone number string + * @param normalizedNumber + * the normalized phone number string + */ + public PhoneNumber(String number, String normalizedNumber) { + this.originalPhoneNumber = number; + this.normalizedPhoneNumber = normalizedNumber; + } + + /** + * The only normalization this method does is removing spaces and punctuation from the phone number string. + * + * @return the normalized phone number + */ + public String toNormalizedString() { + return this.normalizedPhoneNumber; + } + + /** + * Parse a string and pull out a valid phone number if it exists. + * + * @param number + * string to look for a phone number in + * @return PhoneNumber object for found phone number + * @throws IllegalArgumentException + * if parameter doesn't contain a valid phone number + */ + public static PhoneNumber parse(String number) { + String num = basicPhoneNumberCheck(number); + + return isValid(num); + } + + /** + * Perform checks to see if there's a valid phone number. + * + * @param number + * the phone number to check. Should be pre-processed to remove leading/trailing words. 
+ * @return PhoneNumber object for found phone number + * @throws IllegalArgumentException + * if parameter isn't a valid phone number + */ + private static PhoneNumber isValid(String number) { + char[] data = number.toCharArray(); + char[] num = new char[data.length]; + int pos = 0; + int nonDigitCount = 0; + int plusCount = 0; + int dashCount = 0; + int dotCount = 0; + int spaceCount = 0; + int openCount = 0; + int closCount = 0; + int currentSpanLength = 0; + int numSingleDigitSpans = 0; + int numSingleZeroSpans = 0; + int consecutiveDigit = 0; + int maxDigitSpan = 0; + int start = 0; + int end = data.length - 1; + + /** + * This normalizer is just worrying about stripping punctuation from phone numbers, so if this is a string of digits, just return instead of doing the + * other checks. + */ + if (number.matches("^\\d+$")) { + return new PhoneNumber(number); + } + + for (int i = 0; i < data.length; i++) { + if (isDigit(data[i])) { + currentSpanLength++; + if (pos > 0 && data[i] == num[pos - 1]) { + if (consecutiveDigit++ > 4) { + throw new IllegalArgumentException("No more than 4 in a row of the same digit is permitted."); + } + } + + num[pos++] = data[i]; + + continue; + } + + if (currentSpanLength > maxDigitSpan) { + maxDigitSpan = currentSpanLength; + } + + if (currentSpanLength == 1) { + if (num[pos - 1] == '0') { + ++numSingleZeroSpans; + } else { + ++numSingleDigitSpans; + } + + if (numSingleZeroSpans > 1 || numSingleDigitSpans > 1) { + throw new IllegalArgumentException("No more than one single digit and one single zero spans are permitted."); + } + } + + currentSpanLength = 0; + if (i > start && (!isDigit(data[i - 1]) && !(data[i - 1] == ' ' || data[i] == ' ') + && !((data[i - 1] == '+' && data[i] == '(') || (data[i - 1] == '(' && data[i] == '+')))) { + if (!((data[i - 1] == ')' && data[i] == '-') || (data[i - 1] == '-' && data[i] == '('))) { + throw new IllegalArgumentException("No more than one consecutive punctuation charachter is permitted except 
for '+(' or '(+'"); + } + } + + if (data[i] == ' ') { + if (i > 3 && data[i - 1] == '-' && data[i - 2] == ' ') { + --spaceCount; + nonDigitCount -= 2; + } else { + spaceCount++; + if (spaceCount > 5 || (spaceCount > 4 && pos < 11 && num[0] != '0')) { + throw new IllegalArgumentException("Too many spaces found"); + } + } + } + + if (i > 0 && data[i] == ' ' && data[i - 1] == ' ') { + throw new IllegalArgumentException("No more than one consecutive space is permitted."); + } + + if (data[i] == '(') { + openCount++; + } else if (data[i] == ')') { + closCount++; + } else if (data[i] == '+' && ++plusCount > 1) { + throw new IllegalArgumentException("Only one plus sign is allowed."); + } else if (data[i] == '-' && (++dashCount > 3 || i == start)) { + throw new IllegalArgumentException("Only three dashes are allowed, and leading dashes are prohibited."); + } else if (data[i] == '.' && ++dotCount > 2) { + throw new IllegalArgumentException("Only two dots are allowed."); + } + + if (++nonDigitCount > 7) { + throw new IllegalArgumentException("Only seven non-digit characters are allowed."); + } + } + + String s = new String(num, 0, pos); + + if (dotCount > 0 && dashCount > 0) { + throw new IllegalArgumentException("Only one of dots or dashes can be used."); + } else if (pos == MAX_LENGTH && num[0] != '0') { + throw new IllegalArgumentException("With max length number there must be a leading zero"); + } + + int countLeadingZeroOrOne = 0; + int ix = 0; + while (ix < pos && (num[ix] == '0' || num[ix] == '1')) { + ix++; + } + + if (pos < MIN_LENGTH + ix) { + throw new IllegalArgumentException("Ignoring leading zeroes and ones, the number is not long enough"); + } else if (ix + 3 < pos && num[ix] == num[ix + 1] && num[ix] == num[ix + 2] && num[ix] == num[ix + 3] && num[ix] != 8) { + throw new IllegalArgumentException( + "No more than three consecutive same digits after the leading ones and zeroes are permitted unless the digit is '8'."); + } else if (dotCount == 1 && (pos > 7 || 
pos == 7 && data[start + 3] != '.')) { + throw new IllegalArgumentException("If the number contains only one dot, it must contain 7 digits in the form XXX.XXXX"); + } else if (openCount + closCount > 0 && openCount != closCount) { + throw new IllegalArgumentException("Parenthesis mis-match"); + } else if (dotCount + dashCount + plusCount + openCount + closCount == 0 && (pos < 8 || (pos > 11 && spaceCount < 3 && currentSpanLength < 5))) { + throw new IllegalArgumentException("Number is the wrong length to have no puctuation but spaces."); + } else if (pos < 8 && (num[0] == '1' || num[0] == '0')) { + throw new IllegalArgumentException("Number is too short to have a leading one or zero"); + } else if (num[0] == '0' && num[1] == '0' && num[2] == '0') { + throw new IllegalArgumentException("Too many leading zeroes."); + } else if (currentSpanLength < 3 && pos < 10) { + throw new IllegalArgumentException("Valid numbers must be longer to end in a digit span of one or two."); + } + + if (data[start] != '+' && isISBN(s) && (spaceCount > 0 || dashCount > 0 || dotCount > 0) && (openCount + closCount) == 0) { + throw new IllegalArgumentException("Looks like an ISBN"); + } else if (number.matches("^\\d\\d\\d([ \\-])\\d\\d\\1\\d\\d\\d\\d$")) { + throw new IllegalArgumentException(number + " looks like a SSN"); + } else if (number.matches("^[12]\\d\\d\\d ?- ?[12]\\d\\d\\d$")) { + throw new IllegalArgumentException(number + " looks like a year range"); + } else if (number.matches("^(19|20)\\d\\d([\\-\\. ])[01]\\d\\2[0-3]\\d$")) { + throw new IllegalArgumentException(number + " looks like a yyyy mm dd date"); + } else if (number.matches("^(19|20)\\d\\d[01]\\d[0-3]\\d$")) { + throw new IllegalArgumentException(number + " looks like a yyyymmdd date"); + } else if (number.matches("^(19|20)\\d\\d([\\-\\. 
])?[0-3]\\d\\2[01]\\d$")) { + throw new IllegalArgumentException(number + " looks like a yyyy dd mm date"); + } else if (number.matches("^(19|20)\\d\\d[0-3]\\d[01]\\d$")) { + throw new IllegalArgumentException(number + " looks like a yyyyddmm date"); + } else if (number.matches("^[0-3]\\d([\\-\\.])[01]\\d\\1(19|20)\\d\\d ([0-1]\\d|2[0-4])$")) { + throw new IllegalArgumentException(number + " looks like a dd-mm-yyyy hh:mm date"); + } else if (number.matches("^[0-3]\\d([\\-\\.])[1-9]\\1(19|20)\\d\\d ([0-1]\\d|2[0-4])$")) { + throw new IllegalArgumentException(number + " looks like a dd-mm-yyyy hh:mm date"); + } else if (number.matches("^(19|20)\\d\\d([\\-\\. ])([0-2]\\d\\d|3[0-5]\\d|36[0-6])$")) { + throw new IllegalArgumentException(number + " looks like a yyyy jjj date"); + } + + return new PhoneNumber(number, s); + } + + /** + * This will go through the data string looking for a phone number. + * + * @param number + * The data to look for phone numbers in + * @return A string containing what is believed to be a phone number + * @throws IllegalArgumentException + * If data does not contain a possible phone number + */ + private static String basicPhoneNumberCheck(String number) { + /** + * This normalizer is just worrying about stripping punctuation from phone numbers, so if this is a string of digits, just return instead of doing the + * other checks. + */ + if (number.matches("^\\d+$")) { + return number; + } + + char[] data = number.toCharArray(); + + if (data == null) { + throw new IllegalArgumentException("The character array of the string argument is null"); + } else if (data.length < MIN_LENGTH) { + throw new IllegalArgumentException("The data must be at least " + MIN_LENGTH + " characters long. 
Found " + data.length + " characters."); + } + + // trim down the string to pick out phone numbers + for (int i = MIN_LENGTH; i < data.length; i++) { + if (!isDigit(data[i])) { + continue; + } + int start = i - 1; + + while (start >= 0 && isPhoneNumberCharacter(data[start]) && (i - start) <= MAX_LENGTH) { + if ((!isDigit(data[start])) && data[start] == data[start + 1]) { + break; + } + start--; + } + + if (start == -1 || !isPhoneNumberCharacter(data[start])) { + start++; + } + + int seqlen = countDigits(data, start, i); + if (seqlen < MIN_LENGTH || seqlen > MAX_LENGTH) { + continue; + } + + if (start > 1 && data[start - 1] == ':' && isDigit(data[start - 2])) { + boolean spaceok = false; + for (int j = start; j < i; j++) { + spaceok = i - j >= MIN_LENGTH; + if (spaceok) { + start = j + 1; + } + break; + } + if (!spaceok) { + continue; + } + } + + while (data[start] == ')' || data[start] == ' ' || data[start] == '.' || data[start] == '-') { + start++; + } + + while (i + 1 < data.length && isPhoneNumberCharacter(data[i + 1])) { + i++; + } + + while (data[i] == ' ') { + i--; + } + + int lastSpace = i; + while (lastSpace > start && data[lastSpace] != ' ') { + lastSpace--; + } + + if (lastSpace < i && lastSpace > start) { + while (!isDigit(data[i]) && i >= lastSpace) { + i--; + } + + while (data[i] == ' ') { + i--; + } + } + + String rawString = new String(data, start, i - start + 1); + if (start > 0 && Character.isLetter(data[start - 1])) { + continue; + } else if (i < data.length - 2 && data[i + 1] == ',' && isDigit(data[i + 2])) { + continue; + } else if (i < data.length - 3 && data[i + 1] == ']' && data[i + 2] == ')' && start > 2 && data[start - 1] == '[' && data[start - 2] == '(') { + continue; + } else if (countDigits(data, start, i) > MAX_LENGTH) { + continue; + } + + if (data[start] == '+' && data[start + 1] == '+') { + start++; + } + + if (data[i] == '.') { + i--; + } + + if (i < data.length - 1 && ((Character.isLetter(data[i + 1]) && data[i + 1] != 'x' && 
data[i] != 'X') || data[i + 1] == '_' || data[i + 1] == '=' + || data[i + 1] == '?' || data[i + 1] == '\\')) { + continue; + } else if (i < data.length - 2 && data[i + 1] == '/' && !isDigit(data[i + 2])) { + continue; + } else if (i < data.length - 2 && data[i + 1] == '.' && !Character.isWhitespace(data[i + 2])) { + continue; + } else if (start > 0 && (data[start - 1] == '=' || data[start - 1] == '*' || data[start - 1] == '/' || data[start - 1] == '_' + || data[start - 1] == '?' || data[start - 1] == ',' || data[start - 1] == '$' + || (i + 1 < data.length && data[start - 1] == '.' && data[i + 1] == '.') || (data[start - 1] == '.' && data[i] == '.'))) { + continue; + } + + return new String(data, start, i - start + 1); + } + throw new IllegalArgumentException("Did not find a phone number!"); + } + + @Override + public String toString() { + return this.originalPhoneNumber; + } + + @Override + public int compareTo(PhoneNumber o) { + return this.toNormalizedString().compareTo(o.toNormalizedString()); + } + + @Override + public boolean equals(Object o) { + if (o instanceof PhoneNumber) { + /** + * Consider phone numbers equal if they have the same normalized form. + */ + return this.toNormalizedString().equals(((PhoneNumber) o).toNormalizedString()); + } else { + return false; + } + } + + @Override + public int hashCode() { + return this.toNormalizedString().hashCode(); + } + + /** + * Test if character is a digit. + * + * @param d + * the character to test + * @return whether or not the character is a digit + */ + private static boolean isDigit(char d) { + if (DIGITS.indexOf(d) >= 0) { + return true; + } + return false; + } + + /** + * Tests if character is a phone number character (digits, spaces, parens, dash, plus, dot). 
+ * + * @param c + * the character to test + * @return whether or not the character is a phone number character + */ + private static boolean isPhoneNumberCharacter(char c) { + return ((isDigit(c) || c == ' ' || c == '(' || c == ')' || c == '-' || c == '+' || c == '.')); + } + + /** + * Count the number of digits in a character array. + * + * @param data + * The character array to count digits in + * @param start + * The start offset for the character array + * @param end + * The end offset for the character array + * @return The number of digits in the character array in positions [start, end] + */ + private static int countDigits(char data[], int start, int end) { + int count = 0; + char c; + for (int i = start; i <= end; i++) { + c = data[i]; + if (isDigit(c)) { + count++; + } + } + return count; + } + + /** + * Test if string is an ISBN. + * + * @param s + * The string to test + * @return whether or not the string is an ISBN + */ + private static boolean isISBN(String s) { + if (s.length() == 10) { + int sum = 0; + for (int i = 0; i < s.length(); i++) { + if (isDigit(s.charAt(i))) { + sum += (Character.digit(s.charAt(i), 10) * (10 - i)); + } else if (s.charAt(i) == 'X' && i == 9) { + sum += 10; + } + } + return sum % 11 == 0; + } else if (s.length() == 13 && (s.startsWith("978") || s.startsWith("979"))) { + int sum = 0; + for (int i = 0; i < s.length(); i++) { + sum += (Character.digit(s.charAt(i), 10) * (i % 2 == 0 ? 1 : 3)); + } + return sum % 10 == 0; + } + return false; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/util/Point.java b/core/utils/type-utils/src/main/java/datawave/data/type/util/Point.java new file mode 100644 index 00000000000..2bfc16e47db --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/data/type/util/Point.java @@ -0,0 +1,17 @@ +package datawave.data.type.util; + +import java.io.Serializable; + +/** + * This class operates as a delegate for JTS Point instances. 
+ */ +public class Point extends AbstractGeometry implements Comparable, Serializable { + public Point(org.locationtech.jts.geom.Point jtsGeom) { + super(jtsGeom); + } + + @Override + public int compareTo(Point o) { + return jtsGeom.compareTo(o.jtsGeom); + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/query/parser/JavaRegexAnalyzer.java b/core/utils/type-utils/src/main/java/datawave/query/parser/JavaRegexAnalyzer.java new file mode 100644 index 00000000000..c3cd92731e9 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/query/parser/JavaRegexAnalyzer.java @@ -0,0 +1,1228 @@ +package datawave.query.parser; + +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.log4j.Logger; + +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; + +/** + * A class used to analyze and manipulate regular expressions + * + * TODO: if somebody finds a usable java Pattern grammar, please rewrite this class + */ +public class JavaRegexAnalyzer { + protected static final Logger log = Logger.getLogger(JavaRegexAnalyzer.class); + + // Types as applied to portions of the regex. We are interested in portions that + // are literals and those that contain regex constructs. + private enum RegexType { + LITERAL(true), // a literal value + ESCAPED_LITERAL(true), // an escaped literal (e.g. \[ or \.) + REGEX(false), // a regex + REGEX_QUANTIFIER(false), // a regex quantifier like * or + + ESCAPED_REGEX(false), // an escaped regex construct + IGNORABLE_REGEX(false); // an ignorable regex construct (e.g. 
boundary or quoting) + + private boolean literal = false; + + private RegexType(boolean lit) { + this.literal = lit; + } + + public boolean isLiteral() { + return this.literal; + } + } + + private static class RegexPart { + // the regex is not-final to allow applyRegexCaseSensitivity + public String regex; + public RegexType type; + public final boolean nonCapturing; + + public RegexPart(String reg, RegexType typ, boolean nonCapt) { + this.regex = reg; + this.type = typ; + this.nonCapturing = nonCapt; + } + + public RegexPart(String reg, RegexType typ, int nonCapt) { + this(reg, typ, (nonCapt > 0)); + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof RegexPart)) { + return false; + } + RegexPart other = (RegexPart) o; + return regex.equals(other.regex) && type.equals(other.type) && (nonCapturing == other.nonCapturing); + } + + @Override + public int hashCode() { + return regex.hashCode() + type.hashCode() + (nonCapturing ? 1 : 0); + } + + @Override + public String toString() { + return regex; + } + } + + public static class JavaRegexParseException extends ParseException { + private static final long serialVersionUID = -8377431598528407124L; + + public JavaRegexParseException(String s, int errorOffset) { + super(s, errorOffset); + } + } + + // The regex broken into parts + private RegexPart[] regexParts = null; + + // The updated value portion + private String leadingLiteral = null; + private String trailingLiteral = null; + private boolean updatedLiterals = false; + + // do we have a capturing regex somewhere + private boolean hasWildCard = false; + + // the characters that when escaped have special meanings (i.e. 
not an escaped literal value) + private static final String ESCAPED_REGEX_CHARS = "0123456789xutnrfaecdDsSwWpPbBAGzZQE"; + + // Non digit matching regex chars + private static final String NON_DIGIT_ESCAPED_REGEX_CHARS = "tnrfaecDsw"; + + // Quoting regex chars + private static final String QUOTING_REGEX_CHARS = "QE"; + + // Boundary regex chars + private static final String BOUNDARY_REGEX_CHARS = "bBAGzZ"; + + // Boundary chars + private static final String BOUNDARY_CHARS = "^$"; + + // digit character classes + private static final List DIGIT_CHARACTER_CLASSES = Arrays.asList("\\P{Lower}", "\\P{Upper}", "\\p{ASCII}", "\\P{Alpha}", "\\p{Digit}", + "\\p{Alnum}", "\\P{{Punct}", "\\p{Graph}", "\\p{Print}", "\\P{Blank}", "\\P{Cntrl}", "\\p{XDigit}", "\\P{Space}", "\\P{javaLowerCase}", + "\\P{javaUpperCase}", "\\P{javaWhitespace}", "\\P{javaMirrored}", "\\P{InGreek}", "\\P{Lu}", "\\P{Sc}", "\\p{L}"); + + // the character class chars + private static final String CHAR_REGEX_CHARS = "0xutnrfaecdDsSwWpP"; + + // the back reference chars + private static final String BACK_REF_CHARS = "123456789"; + + // characters that are have special meanings + private static final String RESERVED_CHARS = ".*?+{}^$|()[]"; + + // Some pattern precompiling + private static final String FLAG_REGEX = "\\(\\?-?[idmsux]\\).*"; + private static final Pattern flagRegexPattern = Pattern.compile(FLAG_REGEX); + private static final String NON_CAPTURING_REGEX = "\\(\\?[idmsux:=!>(<[=!]-)].*"; + private static final Pattern nonCapturingPattern = Pattern.compile(NON_CAPTURING_REGEX); + private static final String CURLY_QUANTIFIER_REGEX = "\\{([0-9]+)(,([0-9]*))?\\}.*"; + private static final Pattern curlyQuantifierPattern = Pattern.compile(CURLY_QUANTIFIER_REGEX); + + // characters that serve as quantifiers + private static final String QUANTIFIERS = "*+?"; + + private static int MIN_INDEX = 0; + private static int MAX_INDEX = 1; + + private static final RegexPart OPEN_PAREN = new RegexPart("(", 
RegexType.REGEX, false); + private static final RegexPart CLOSE_PAREN = new RegexPart(")", RegexType.REGEX, false); + private static final RegexPart ALTERNATE = new RegexPart("|", RegexType.REGEX, false); + + // construct a regex analyzer + public JavaRegexAnalyzer(String regex) throws JavaRegexParseException { + setRegex(regex); + } + + public String getRegex() { + return getRegex(regexParts); + } + + public static String getRegex(RegexPart[] regexParts) { + StringBuilder regex = new StringBuilder(); + for (RegexPart part : regexParts) { + regex.append(part.regex); + } + return regex.toString(); + } + + @Override + public String toString() { + return getRegex(); + } + + /** + * Set the regex on this analyzer. This will do the parsing of the regex into its parts up front. Note that this parser only needs to parse enough for the + * purposes of the applyRegexCaseSensitivity and the determination of the leading and trailing literals. + * + * @param regex + */ + public void setRegex(String regex) throws JavaRegexParseException { + regexParts = null; + List partList = new ArrayList<>(); + + // parse on '\' characters, + // then walk forward from each one to determine the escaped character or character class, parsing on '[' and ']' characters + // as we go + // note that things between a \Q and \E do not count as escaped character or character classes. 
+ String[] parts = Iterables.toArray(Splitter.on('\\').split(regex), String.class); + + // is the next section/part escaped + boolean escaped = false; + + // are we in a quoted section (between \\Q and \\E) + boolean quoted = false; + + // keeping track of paren and bracket nesting + LinkedList parensAndBrackets = new LinkedList<>(); + + // remember if we are inside any brackets to enable distinguishing between LITERAL and REGEX + int bracketCount = 0; + + // are we in a non-capturing group: we want to hold these as separate entities as they can be ignored with determining the updated value + int nonCapturing = 0; + + // expression is the portion prefaced with a '\\' + String expression = null; + + // remainder is non-escaped portion + String remainder = null; + + // keep track of the column for exceptions + int column = 0; + + // for each part + for (int i = 0; i < parts.length; i++) { + // if not an escaped portion, then the entire part is the remainder and the next part is escaped + if (!escaped) { + remainder = parts[i]; + escaped = true; + } + // if in a quoted section, then end quoting if we find \\E + else if (quoted) { + if (parts[i].startsWith("E")) { + quoted = false; + expression = "E"; + remainder = parts[i].substring(1); + } else { + expression = ""; + remainder = parts[i]; + } + } + // else in an escaped, non-quoted section + else { + // endExpression is the division between the escaped character/class and the remainder + int endExpression = 0; + + // check for \\\\ + if (parts[i].equals("")) { + parts[i] = "\\"; + endExpression = 1; + // In this case the next section is not escaped + escaped = false; + } + // check for a \\p{class} construct + else if (parts[i].startsWith("p{")) { + endExpression = parts[i].indexOf('}'); + if (endExpression < 0) { + throw new JavaRegexParseException("Invalid Regular Expression: Found a \\p{... 
without and end } character: " + parts[i], column); + } else { + endExpression++; + } + } + // check for a \\P{class} construct + else if (parts[i].startsWith("P{")) { + endExpression = parts[i].indexOf('}'); + if (endExpression < 0) { + throw new JavaRegexParseException("Invalid Regular Expression: Found a \\P{... without and end } character: " + parts[i], column); + } else { + endExpression++; + } + } + // check for a \\cX construct + else if (parts[i].startsWith("c")) { + if (parts[i].length() == 1) { + throw new JavaRegexParseException("Invalid Regular Expression: Found a \\cX without the X character: " + parts[i], column); + } + endExpression = 2; + } + // check for a \\0 (back reference), or \\0n or \\0nn or \\0mnn (octal character) + else if (parts[i].startsWith("0")) { + int maxExpression = Math.min(4, parts[i].length()); + for (endExpression = 1; endExpression < maxExpression; endExpression++) { + try { + if (Integer.parseInt(parts[i].substring(1, endExpression + 1), 8) > 255) { + break; + } + } catch (Exception e) { + break; + } + } + } + // check for \\xhh + else if (parts[i].startsWith("x")) { + if (parts[i].length() < 3) { + throw new JavaRegexParseException("Invalid Regular Expression: Found a \\xhh without the hh characters: " + parts[i], column); + } + endExpression = 3; + try { + Integer.parseInt(parts[i].substring(1, endExpression), 16); + } catch (Exception e) { + throw new JavaRegexParseException("Invalid Regular Expression: Found a \\xhh without the hh characters: " + parts[i], column); + } + } + // check for \\uhhhh + else if (parts[i].startsWith("u")) { + if (parts[i].length() < 5) { + throw new JavaRegexParseException("Invalid Regular Expression: Found a \\uhhhh without the hhhh characters: " + parts[i], column); + } + endExpression = 5; + try { + Integer.parseInt(parts[i].substring(1, endExpression), 16); + } catch (Exception e) { + throw new JavaRegexParseException("Invalid Regular Expression: Found a \\uhhhh without the hh characters: " + 
parts[i], column); + } + } + // assume \\? + else { + endExpression = 1; + } + + // now pull off the expression and remainder + if (endExpression == 0) { + remainder = parts[i]; + } else if (endExpression < parts[i].length()) { + expression = parts[i].substring(0, endExpression); + remainder = parts[i].substring(endExpression); + } else { + expression = parts[i]; + } + } + + if (expression != null) { + RegexType type = RegexType.ESCAPED_REGEX; + + // determine if this is an escaped regex or an escaped literal + if (expression.length() == 1) { + if (ESCAPED_REGEX_CHARS.indexOf(expression.charAt(0)) < 0) { + // if we are in a bracket, then its a regex + type = (bracketCount > 0 ? RegexType.REGEX : RegexType.ESCAPED_LITERAL); + } + // check for quoting chars + else if (QUOTING_REGEX_CHARS.indexOf(expression.charAt(0)) >= 0) { + if (expression.equals("Q")) { + quoted = true; + } + type = RegexType.IGNORABLE_REGEX; + } + // check for boundary chars + else if (BOUNDARY_REGEX_CHARS.indexOf(expression.charAt(0)) >= 0) { + type = RegexType.IGNORABLE_REGEX; + } + } + + partList.add(new RegexPart("\\" + expression, type, nonCapturing)); + column += expression.length() + 1; + + expression = null; + } + + if (remainder != null) { + if (quoted) { + // if we are in a bracket, then its a regex + RegexType type = (bracketCount > 0 ? RegexType.REGEX : RegexType.LITERAL); + partList.add(new RegexPart(remainder, type, nonCapturing)); + column += remainder.length(); + } else { + // check for () or [] constructs + for (int c = 0; c < remainder.length(); c++) { + char character = remainder.charAt(c); + + if (RESERVED_CHARS.indexOf(character) >= 0) { + if (character == '(') { + // look for a non-capturing group + if (c < remainder.length() - 1 && remainder.charAt(c + 1) == '?') { + nonCapturing++; + String value = remainder.substring(c); + // look for a flag + if (flagRegexPattern.matcher(value).matches()) { + int len = (value.charAt(2) == '-' ? 
5 : 4); + value = value.substring(0, len); + partList.add(new RegexPart(value, RegexType.REGEX, nonCapturing)); + column += value.length(); + nonCapturing--; + c += len - 1; + } else if (nonCapturingPattern.matcher(value).matches()) { + int len = (value.charAt(2) == '<' ? 4 : 3); + value = value.substring(0, len); + parensAndBrackets.addLast(value); + partList.add(new RegexPart(value, RegexType.REGEX, nonCapturing)); + column += value.length(); + c += len - 1; + } else { + throw new JavaRegexParseException( + "Invalid Regular Expression: does not match a known non-capturing group construct: " + value, column); + } + } else { + parensAndBrackets.addLast("("); + // forcing this to non-capturing as it is really not part of the literal value contained therein + partList.add(new RegexPart("(", RegexType.REGEX, nonCapturing)); + column += 1; + } + } else if (character == '[') { + bracketCount++; + if (c < remainder.length() - 1 && remainder.charAt(c + 1) == '^') { + parensAndBrackets.addLast("[^"); + partList.add(new RegexPart("[^", RegexType.REGEX, nonCapturing)); + column += 2; + c++; + } else { + parensAndBrackets.addLast("["); + partList.add(new RegexPart("[", RegexType.REGEX, nonCapturing)); + column += 1; + } + } else if (character == '{') { + String value = remainder.substring(c); + // look for a full {n} or {n,m} + if (curlyQuantifierPattern.matcher(value).matches()) { + int len = value.indexOf('}') + 1; + value = value.substring(0, len); + partList.add(new RegexPart(value, RegexType.REGEX_QUANTIFIER, nonCapturing)); + column += value.length(); + c += len - 1; + } else { + throw new JavaRegexParseException("Found a {... 
but expected {n} or {n,} or {n,m}: " + value, column); + } + } else if (character == ')') { + // forcing this to non-capturing as it is really not part of the literal value contained therein + partList.add(new RegexPart(")", RegexType.REGEX, nonCapturing)); + column += 1; + String closing = parensAndBrackets.removeLast(); + if (closing.charAt(0) != '(') { + throw new JavaRegexParseException("Invalid Regular Expression: unexpected closing paren", column); + } + if (closing.length() > 1) { + nonCapturing--; + } + } else if (character == ']') { + partList.add(new RegexPart("]", RegexType.REGEX, nonCapturing)); + column += 1; + String closing = parensAndBrackets.removeLast(); + if (closing.charAt(0) != '[') { + throw new JavaRegexParseException("Invalid Regular Expression: unexpected closing square bracket", column); + } + bracketCount--; + } else if (character == '}') { + partList.add(new RegexPart("}", RegexType.REGEX, nonCapturing)); + column += 1; + String closing = parensAndBrackets.removeLast(); + if (closing.charAt(0) != '{') { + throw new JavaRegexParseException("Invalid Regular Expression: unexpected closing curly bracket", column); + } + } else if (QUANTIFIERS.indexOf(character) >= 0) { + partList.add(new RegexPart(Character.toString(character), RegexType.REGEX_QUANTIFIER, nonCapturing)); + column += 1; + } else if (BOUNDARY_CHARS.indexOf(character) >= 0) { + partList.add(new RegexPart(Character.toString(character), RegexType.IGNORABLE_REGEX, nonCapturing)); + column += 1; + } else { + partList.add(new RegexPart(Character.toString(character), RegexType.REGEX, nonCapturing)); + column += 1; + } + } else if (bracketCount > 0 && character == '&' && c < remainder.length() - 1 && remainder.charAt(c + 1) == '&') { + // this is a special case in a character class contruct + partList.add(new RegexPart("&&", RegexType.REGEX, nonCapturing)); + column += 2; + c++; + } else { + partList.add(new RegexPart(Character.toString(character), RegexType.LITERAL, nonCapturing)); + 
column += 1; + } + } + } + remainder = null; + } + } + if (!parensAndBrackets.isEmpty()) { + throw new JavaRegexParseException("Invalid Regular Expression: missing closing paren or bracket", column); + } + if (quoted) { + throw new JavaRegexParseException("Invalid Regular Expression: missing closing quoted section (\\E)", column); + } + regexParts = partList.toArray(new RegexPart[partList.size()]); + } + + /** + * Determine the leading and trailing literals, and update the hasWildCard boolean while we are at it + */ + private void updateLiteral() { + if (!updatedLiterals) { + updateLeadingLiteralAndWildCard(); + updateTrailingLiteral(); + updatedLiterals = true; + } + } + + private void updateLeadingLiteralAndWildCard() { + leadingLiteral = null; + hasWildCard = false; + + // a stack of literal builders used for nested capturing groups. If empty then we are at the top level. + LinkedList literalBuilders = new LinkedList<>(); + // the current literal builder + StringBuilder literalBuilder = new StringBuilder(); + // appendLiteral is set false once we have found a regex and we need to terminate with what we have + boolean appendLiteral = true; + + for (int i = 0; i < regexParts.length; i++) { + RegexPart part = regexParts[i]; + + // if we are done and we have already resolved all nestings, then we are done + if (!appendLiteral && literalBuilders.isEmpty()) { + break; + } + + // simply ignore nonCapturing portions + if (part.nonCapturing) { + continue; + } + + // ignore the ignorable + if (part.type.equals(RegexType.IGNORABLE_REGEX)) { + continue; + } + + // if a literal then append to the current builder + if (part.type.isLiteral()) { + if (appendLiteral && atLeastOnce(i)) { + if (part.type == RegexType.ESCAPED_LITERAL) { + literalBuilder.append(part.regex.substring(1)); + } else { + literalBuilder.append(part.regex); + } + } + } + // if a capturing group, the push the literal builders + else if (part.regex.equals("(")) { + literalBuilders.addLast(literalBuilder); + 
literalBuilder = new StringBuilder(); + } + // if ending a capturing group, then pop the literal builders, appending as appropriate + else if (part.regex.equals(")")) { + if (atLeastOnce(i)) { + literalBuilders.getLast().append(literalBuilder); + } + literalBuilder = literalBuilders.removeLast(); + } else { + // if a logical OR, then empty the literal we are working on + if (part.regex.equals("|")) { + literalBuilder.setLength(0); + } + + // we are now done appending literals as we have found a non-literal + appendLiteral = false; + + // we can set the hasWildCard to true now + hasWildCard = true; + } + } + if (literalBuilder.length() > 0) { + leadingLiteral = literalBuilder.toString(); + } + } + + private void updateTrailingLiteral() { + trailingLiteral = null; + + // appendLiteral is set false once we have found a regex and we need to terminate with what we have + boolean appendLiteral = true; + + // a stack of literal builders used for nested capturing groups. If empty then we are at the top level. 
+ LinkedList literalBuilders = new LinkedList<>(); + // the current literal builder + StringBuilder literalBuilder = new StringBuilder(); + // a stack of atLeastOnce flags which tracks with the literalBuilders stack + LinkedList atLeastOnceFlags = new LinkedList<>(); + // the current atLeastOnce flag + boolean atLeastOnce = true; + + // have we found a quantifier yet + boolean quantifierFound = false; + + for (int i = regexParts.length - 1; i >= 0; i--) { + RegexPart part = regexParts[i]; + + // if we are done and we have already resolved all nestings, then we are done + if (!appendLiteral && literalBuilders.isEmpty()) { + break; + } + + // simply ignore nonCapturing portions + if (part.nonCapturing) { + continue; + } + + // ignore the ignorable + if (part.type.equals(RegexType.IGNORABLE_REGEX)) { + continue; + } + + // ignore quantifiers + if (part.type == RegexType.REGEX_QUANTIFIER) { + // if we may have none of the preceding value, then we are done + if (!atLeastOnce(i - 1)) { + appendLiteral = false; + } + quantifierFound = true; + continue; + } + + // if a literal then prepend to the current builder + if (part.type.isLiteral()) { + if (appendLiteral) { + if (part.type == RegexType.ESCAPED_LITERAL) { + literalBuilder.insert(0, part.regex.substring(1)); + } else { + literalBuilder.insert(0, part.regex); + } + // if a quantifier was found at the top level, then we are now done + if (quantifierFound && literalBuilders.isEmpty()) { + appendLiteral = false; + } + } + } + // if a capturing group, the push the literal builders + else if (part.regex.equals(")")) { + literalBuilders.addLast(literalBuilder); + atLeastOnceFlags.addLast(atLeastOnce); + literalBuilder = new StringBuilder(); + atLeastOnce = atLeastOnce(i); + } + // if ending a capturing group, then pop the literal builders, appending as appropriate + else if (part.regex.equals("(")) { + if (atLeastOnce) { + literalBuilders.getLast().insert(0, literalBuilder); + } + literalBuilder = 
literalBuilders.removeLast(); + atLeastOnce = atLeastOnceFlags.removeLast(); + + // if a quantifier was found, then we are done + if (quantifierFound) { + appendLiteral = false; + } + } + // else some other regex + else { + // if a logical OR, then empty the literal we are working on + if (part.regex.equals("|")) { + literalBuilder.setLength(0); + } + + // we are now done appending literals as we have found a non-literal + appendLiteral = false; + } + } + if (literalBuilder.length() > 0) { + trailingLiteral = literalBuilder.toString(); + } + } + + /** + * Determine if the part at index i is to occur at least once as determined by an optional following regex quantifier + * + * @param i + */ + private boolean atLeastOnce(int i) { + // only use this literal if the following part is not ?, *, or {0,... + boolean atLeastOnce = true; + if (followedByQuantifier(i)) { + if (regexParts[i + 1].regex.equals("?") || regexParts[i + 1].regex.equals("*")) { + atLeastOnce = false; + } else if (regexParts[i + 1].regex.equals("{0}") || regexParts[i + 1].regex.startsWith("{0,")) { + atLeastOnce = false; + } + } + return atLeastOnce; + } + + /** + * Determine if the part at index i is followed by a quantifier + * + * @param i + */ + private boolean followedByQuantifier(int i) { + return (i < (regexParts.length - 1) && regexParts[i + 1].type == RegexType.REGEX_QUANTIFIER); + } + + public boolean hasWildCard() { + updateLiteral(); + return hasWildCard; + } + + public boolean isLeadingLiteral() { + updateLiteral(); + return leadingLiteral != null; + } + + public boolean isTrailingLiteral() { + updateLiteral(); + return trailingLiteral != null; + } + + public boolean isLeadingRegex() { + updateLiteral(); + return leadingLiteral == null; + } + + public boolean isTrailingRegex() { + updateLiteral(); + return trailingLiteral == null; + } + + public boolean isNgram() { + updateLiteral(); + return (leadingLiteral == null && trailingLiteral == null); + } + + public String getLeadingLiteral() { 
+ updateLiteral(); + return leadingLiteral; + } + + public String getTrailingLiteral() { + updateLiteral(); + return trailingLiteral; + } + + public String getLeadingOrTrailingLiteral() { + updateLiteral(); + return (leadingLiteral != null ? leadingLiteral : trailingLiteral); + } + + /** + * Given an ip regex, zero pad it out to create a regex for the normalized ip value. + * + * This method does not attempt to discern the intent of the user when a wildcard is specified mid-octet. It always tries to zero-pad the octet in which the + * wildcard was found + * + * For example, 1.2.1* has the potential to match 001.002.001.*, 001.002.010.*, or 001.002.100.*. This method will return an expansion of 001.002.001.* for + * that input. + * + * @return If the zero-padded variant consists of octets of length 3, the zero-padded regex variant. Else, the original ip address. + * @throws JavaRegexParseException + */ + public String getZeroPadIpRegex() throws JavaRegexParseException { + StringBuilder builder = new StringBuilder(); + + RegexPart split = new RegexPart("\\.", RegexType.ESCAPED_LITERAL, false); + + // split up the parts into those that would match against a tuple + // to do that we find the literal '.' matches + List tuples = splitParts(this.regexParts, split); + + List ignore = Arrays.asList(split, ALTERNATE, OPEN_PAREN, CLOSE_PAREN); + + // if we found a tuple that crosses over an open group or a close group, then we have a situation + // we cannot handle currently. This gets even more complicates with alternatives within the groups. + // (e.g. \\.m(n\\.o)p\\. 
in which case the tuples are actually mn and op) + boolean inTuple = false; + String previousTuple = null; + + // now for each tuple, prefix with '0' literals as needed + for (RegexPart[] tuple : tuples) { + if (tuple.length != 1 || !ignore.contains(tuple[0])) { + if (inTuple) { + throw new JavaRegexParseException( + "Currently cannot handle tuples that cross over group boundaries: " + previousTuple + " and " + getRegex(tuple), -1); + } + inTuple = true; + previousTuple = getRegex(tuple); + + if (!allDigits(tuple)) { + throw new JavaRegexParseException("This tuple matches non digits and hence cannot match an IPV4: " + previousTuple, -1); + } + + // now prefix with '0' literals from 3-min to 3-max + int[] bounds = countMatchedChars(tuple); + if (bounds[MIN_INDEX] < 3) { + // verify that the characters matched would actually be digits + if (bounds[MIN_INDEX] == bounds[MAX_INDEX]) { + int count = 3 - bounds[MIN_INDEX]; + for (int i = 0; i < count; i++) { + builder.append('0'); + } + } else { + int lower = 3 - bounds[MAX_INDEX]; + if (lower < 0) { + lower = 0; + } + int upper = 3 - bounds[MIN_INDEX]; + builder.append("0{").append(lower).append(',').append(upper).append('}'); + } + } + } else if (tuple[0].equals(split)) { + inTuple = false; + } else if (tuple[0].equals(ALTERNATE)) { + inTuple = false; + } + builder.append(getRegex(tuple)); + } + return builder.toString(); + } + + private boolean allDigits(RegexPart[] tuple) { + LinkedList negatedCharClass = new LinkedList<>(); + boolean negated = false; + for (RegexPart part : tuple) { + switch (part.type) { + case LITERAL: + for (int i = 0; i < part.regex.length(); i++) { + if (negated == Character.isDigit(part.regex.charAt(i))) { + return false; + } + } + break; + case ESCAPED_LITERAL: + if (negated == Character.isDigit(part.regex.charAt(1))) { + return false; + } + break; + case REGEX: + if (part.regex.equals("[")) { + negatedCharClass.addLast(Boolean.valueOf(negated)); + } else if (part.regex.equals("[^")) { + 
negatedCharClass.addLast(Boolean.valueOf(negated)); + negated = !negated; + } else if (part.regex.equals("]")) { + negated = negatedCharClass.removeLast(); + } + break; + case ESCAPED_REGEX: { + if (part.regex.charAt(1) == 'p' || part.regex.charAt(1) == 'P') { + if (negated == DIGIT_CHARACTER_CLASSES.contains(part.regex)) { + return false; + } + } else if (negated == (NON_DIGIT_ESCAPED_REGEX_CHARS.indexOf(part.regex.charAt(1)) < 0)) { + return false; + } + } + } + } + return true; + } + + /** + * Split up a list of parts using a specified separator. If a separator is found inside of a nested group, then that group and its ancestors begin and end + * parentheses will be returned as separate parts. Separators are included as separate parts. + * + * @param character + * @param escaped + * @return the part lists + * @throws JavaRegexParseException + */ + public List splitParts(char character, boolean escaped) throws JavaRegexParseException { + // create the part + RegexType type = (escaped ? RegexType.ESCAPED_LITERAL : RegexType.LITERAL); + if (escaped && (ESCAPED_REGEX_CHARS.indexOf(character) >= 0)) { + if (QUOTING_REGEX_CHARS.indexOf(character) >= 0 || BOUNDARY_REGEX_CHARS.indexOf(character) >= 0) { + type = RegexType.IGNORABLE_REGEX; + } else { + type = RegexType.ESCAPED_REGEX; + } + } else if (!escaped && (RESERVED_CHARS.indexOf(character) >= 0)) { + if (BOUNDARY_CHARS.indexOf(character) >= 0) { + type = RegexType.IGNORABLE_REGEX; + } else { + type = RegexType.REGEX; + } + } + List parts = splitParts(this.regexParts, new RegexPart(Character.toString(character), type, false)); + List regex = new ArrayList<>(); + for (RegexPart[] part : parts) { + regex.add(getRegex(part)); + } + return regex; + } + + /** + * Split up a list of parts using a specified separator. If a separator is found inside of a nested group, then that group and its ancestors begin and end + * parentheses will be returned as separate parts as well as alternates. 
Separators are included as separate parts. + * + * @param parts + * @param separator + * @return the part lists + * @throws JavaRegexParseException + */ + private List splitParts(RegexPart[] parts, RegexPart separator) throws JavaRegexParseException { + LinkedList tuples = new LinkedList<>(); + int start = 0; + int level = 0; + int separationLevel = 0; + for (int i = 0; i < parts.length; i++) { + if (parts[i].equals(OPEN_PAREN)) { + level++; + } else if (parts[i].equals(CLOSE_PAREN)) { + if (level == 0) { + throw new JavaRegexParseException("Non matching groups", -1); + } + level--; + if (level < separationLevel) { + if (start < i) { + tuples.addLast(Arrays.copyOfRange(parts, start, i)); + } + tuples.addLast(new RegexPart[] {CLOSE_PAREN}); + start = i + 1; + // now move the separation level up one + separationLevel = level; + } + } else if (parts[i].equals(separator)) { + // split up appropriately accounting for levels and alternates + separationLevel = level; + int begin = start; + for (int j = start; j < i; j++) { + if (parts[j].equals(ALTERNATE)) { + if (start < j) { + tuples.addLast(Arrays.copyOfRange(parts, start, j)); + } + tuples.addLast(new RegexPart[] {ALTERNATE}); + start = j + 1; + } else if (parts[j].equals(OPEN_PAREN)) { + if (start < j) { + tuples.addLast(Arrays.copyOfRange(parts, start, j)); + } + tuples.addLast(new RegexPart[] {OPEN_PAREN}); + start = j + 1; + } else if (parts[j].equals(CLOSE_PAREN)) { + // we found a closing paren which must have a matching open paren within a non-separated section + + // move back to the matching paren + RegexPart[] last = tuples.removeLast(); + while (last.length != 1 || !last[0].equals(OPEN_PAREN)) { + start -= last.length; + last = tuples.removeLast(); + } + start -= last.length; + last = (tuples.isEmpty() || start == begin ? 
null : tuples.removeLast()); + + // now move back to the previous paren, separator, or beginning + while (last != null && (last.length != 1 || !(last[0].equals(OPEN_PAREN) || last[0].equals(separator)))) { + start -= last.length; + last = (tuples.isEmpty() || start == begin ? null : tuples.removeLast()); + } + if (last != null) { + tuples.addLast(last); + } + } + } + + if (i > start) { + tuples.addLast(Arrays.copyOfRange(parts, start, i)); + } + tuples.addLast(new RegexPart[] {separator}); + start = i + 1; + } + } + if (level > 0) { + throw new JavaRegexParseException("Non matching groups", -1); + } + if (parts.length > start) { + tuples.addLast(Arrays.copyOfRange(parts, start, parts.length)); + } + + return tuples; + } + + /** + * Determine the minimum and maximum number of characters that a regex will match + * + * @return the min and max number of characters that the regex will match + * @throws JavaRegexParseException + */ + public int[] countMatchedChars() throws JavaRegexParseException { + return countMatchedChars(this.regexParts); + } + + /** + * Determine the minimum and maximum number of characters that a regex will match + * + * @param parts + * @return the min and max number of characters that the regex will match + * @throws JavaRegexParseException + */ + private int[] countMatchedChars(RegexPart[] parts) throws JavaRegexParseException { + + // a stack of ranges used for nested capturing groups with alternates. If empty then we are at the top level. + LinkedList> groups = new LinkedList<>(); + + // the current alternates. If empty then we have no alternates + LinkedList alternates = new LinkedList<>(); + + // the current bounds + int[] bounds = new int[2]; + + // are we in a character class section [...] 
+ int charClass = 0; + + // now count the digits + int column = 0; + int len = parts.length; + for (int partIndex = 0; partIndex < len; partIndex++) { + RegexPart part = parts[partIndex]; + if (part.nonCapturing) { + continue; + } + if (charClass > 0) { + if (part.type == RegexType.REGEX) { + if (part.regex.equals("]")) { + charClass--; + if (charClass == 0) { + updateBounds(bounds, 1, parts, partIndex); + } + } else if (part.regex.startsWith("[")) { + charClass++; + } + } + } else { + switch (part.type) { + case LITERAL: + updateBounds(bounds, part.regex.length(), parts, partIndex); + break; + case ESCAPED_LITERAL: + updateBounds(bounds, part.regex.length() - 1, parts, partIndex); + break; + case ESCAPED_REGEX: + if (CHAR_REGEX_CHARS.indexOf(part.regex.charAt(1)) >= 0) { + updateBounds(bounds, 1, parts, partIndex); + } else if (BACK_REF_CHARS.indexOf(part.regex.charAt(1)) >= 0) { + // unsupported + throw new JavaRegexParseException("Cannot deal with back references in zeroPadRegex", column); + } + break; + case REGEX: + if (part.regex.equals("(")) { + alternates.addLast(bounds); + groups.addLast(alternates); + bounds = new int[2]; + alternates = new LinkedList<>(); + } else if (part.regex.equals(")")) { + if (alternates.isEmpty()) { + throw new JavaRegexParseException("Found an illegal close ')' to a group without the open '('", column); + } + alternates.addLast(bounds); + int[] update = summarize(alternates); + alternates = groups.removeLast(); + bounds = alternates.removeLast(); + updateBounds(bounds, update, parts, partIndex); + } else if (part.regex.startsWith("[")) { + charClass++; + } else if (part.regex.equals("]")) { + // this would have been handled above... 
+ throw new JavaRegexParseException("Found an illegal close ']' to a character class without the open '['", column); + } else if (part.regex.equals("|")) { + alternates.addLast(bounds); + bounds = new int[2]; + } else if (part.regex.equals(".")) { + updateBounds(bounds, 1, parts, partIndex); + } else { + // unsupported + throw new JavaRegexParseException("Cannot deal with " + part.regex + " in zeroPadRegex", column); + } + break; + case REGEX_QUANTIFIER: + // already handled in updateBounds....skip + break; + case IGNORABLE_REGEX: + // ignore the ignorable + break; + } + } + } + alternates.addLast(bounds); + return summarize(alternates); + } + + /** + * Summarize a list of alternatives returning the min of the mins and the max of the maxes + * + * @param alternates + * @return the summarized bounds + */ + private static int[] summarize(LinkedList alternates) { + int[] bounds = alternates.removeLast(); + while (!alternates.isEmpty()) { + int[] alternate = alternates.removeLast(); + bounds[MIN_INDEX] = Math.min(bounds[MIN_INDEX], alternate[MIN_INDEX]); + bounds[MAX_INDEX] = Math.max(bounds[MAX_INDEX], alternate[MAX_INDEX]); + } + return bounds; + } + + /** + * Updated the bounds with a quantity, taking a following quantifier into account. + * + * @param bounds + * The bounds to update + * @param quantity + * The quantity to update both the min and max with + * @param parts + * The regex parts + * @param partIndex + * The current regex pointer + * @throws JavaRegexParseException + */ + private void updateBounds(int[] bounds, int quantity, RegexPart[] parts, int partIndex) throws JavaRegexParseException { + updateBounds(bounds, new int[] {quantity, quantity}, parts, partIndex); + } + + /** + * Updated the bounds with a min and max quantity, taking a following quantifier into account. 
+ * + * @param bounds + * The bounds to update + * @param quantity + * The min and max quantity to update with + * @param parts + * The regex parts + * @param partIndex + * The current regex pointer + * @throws JavaRegexParseException + */ + private void updateBounds(int[] bounds, int[] quantity, RegexPart[] parts, int partIndex) throws JavaRegexParseException { + int nextIndex = partIndex + 1; + int[] multiplier = new int[] {1, 1}; + if (nextIndex < parts.length && parts[nextIndex].type == RegexType.REGEX_QUANTIFIER) { + RegexPart part = parts[nextIndex]; + // up by the max count + Matcher matcher = curlyQuantifierPattern.matcher(part.regex); + if (matcher.matches()) { + if (matcher.groupCount() == 3 && !matcher.group(3).isEmpty()) { + multiplier[MAX_INDEX] = Integer.parseInt(matcher.group(3)); + } else { + multiplier[MAX_INDEX] = Integer.parseInt(matcher.group(1)); + } + multiplier[MIN_INDEX] = Integer.parseInt(matcher.group(1)); + } else if (part.regex.equals("?")) { + multiplier[MIN_INDEX] = 0; + multiplier[MAX_INDEX] = 1; + } else if (part.regex.equals("+")) { + multiplier[MIN_INDEX] = 1; + multiplier[MAX_INDEX] = Integer.MAX_VALUE / 2; + } else if (part.regex.equals("*")) { + multiplier[MIN_INDEX] = 0; + multiplier[MAX_INDEX] = Integer.MAX_VALUE / 2; + } else { + throw new JavaRegexParseException("Cannot deal with the quantifier " + part.regex, -1); + } + } + + bounds[MIN_INDEX] += quantity[MIN_INDEX] * multiplier[MIN_INDEX]; + bounds[MAX_INDEX] += quantity[MAX_INDEX] * multiplier[MAX_INDEX]; + } + + /** + * Apply uppercase or lowercase to a regex, leaving all character class constants alone. It will replace \\p{Lu}, \\p{Lower}, \\p{Upper} \\p{javaLowerCase} + * and \\p{javaUpperCase} as well. TODO: Nested negations of upper or lower character classes are not be handled correctly. 
+ * + * @param upperCase + */ + public void applyRegexCaseSensitivity(boolean upperCase) { + // one possibility is to simply add the case independence flag...but does not + // work for shardIndex query....maybe we can modify that logic appropriately.... + // return "(?i" + regex + ')'; + + // translate the uppercase and lowercase character classes + // and apply the upcase or lowercase to all literals + for (int i = 0; i < regexParts.length; i++) { + RegexPart part = regexParts[i]; + if (part.type.isLiteral()) { + part.regex = (upperCase ? part.regex.toUpperCase() : part.regex.toLowerCase()); + } else { + // check for \\p{Lower} or \\p{Upper} + // check for \\p{javaLowerCase} or \\p{javaUpperCase} + // check for \\p{Lu} + boolean negated = (i > 0 && regexParts[i - 1].regex.equals("[^")); + if ((upperCase != negated) && part.regex.equals("\\p{Lower}")) { + if (negated) { + regexParts[i - 1].regex = "["; + } else { + part.regex = "\\p{Upper}"; + } + } else if ((upperCase != negated) && part.regex.equals("\\p{javaLowerCase}")) { + if (negated) { + regexParts[i - 1].regex = "["; + } else { + part.regex = "\\p{javaUpperCase}"; + } + } else if ((upperCase == negated) && part.regex.equals("\\p{Upper}")) { + if (negated) { + regexParts[i - 1].regex = "["; + } else { + part.regex = "\\p{Lower}"; + } + } else if ((upperCase == negated) && part.regex.equals("\\p{javaUpperCase}")) { + if (negated) { + regexParts[i - 1].regex = "["; + } else { + part.regex = "\\p{javaLowerCase}"; + } + } else if ((upperCase == negated) && part.regex.equals("\\p{Lu}")) { + if (negated) { + regexParts[i - 1].regex = "["; + } else { + part.regex = "\\p{L}"; + } + } else if ((upperCase != negated) && part.regex.equals("\\P{Upper}")) { + if (negated) { + regexParts[i - 1].regex = "["; + } else { + part.regex = "\\p{Upper}"; + } + } else if ((upperCase != negated) && part.regex.equals("\\P{javaUpperCase}")) { + if (negated) { + regexParts[i - 1].regex = "["; + } else { + part.regex = 
"\\p{javaUpperCase}"; + } + } else if ((upperCase == negated) && part.regex.equals("\\P{Lower}")) { + if (negated) { + regexParts[i - 1].regex = "["; + } else { + part.regex = "\\p{Lower}"; + } + } else if ((upperCase == negated) && part.regex.equals("\\P{javaLowerCase}")) { + if (negated) { + regexParts[i - 1].regex = "["; + } else { + part.regex = "\\p{javaLowerCase}"; + } + } else if ((upperCase != negated) && part.regex.equals("\\P{Lu}")) { + if (negated) { + regexParts[i - 1].regex = "["; + } else { + part.regex = "\\p{L}"; + } + } + + } + } + + // now reset the updated value + updatedLiterals = false; + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/webservice/query/data/ObjectSizeOf.java b/core/utils/type-utils/src/main/java/datawave/webservice/query/data/ObjectSizeOf.java new file mode 100644 index 00000000000..01908d65ec9 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/webservice/query/data/ObjectSizeOf.java @@ -0,0 +1,220 @@ +package datawave.webservice.query.data; + +import java.lang.reflect.Array; +import java.lang.reflect.Field; +import java.lang.reflect.Method; +import java.lang.reflect.Modifier; +import java.util.Date; +import java.util.HashSet; +import java.util.Set; +import java.util.Stack; +import java.util.concurrent.ConcurrentHashMap; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A simple interface that objects can implement to return the object size. 
+ */ +public interface ObjectSizeOf { + /** + * The (approximate) size of the object + */ + long sizeInBytes(); + + class ObjectInstance { + private Object o; + + public ObjectInstance(Object _o) { + this.o = _o; + } + + public Object getObject() { + return o; + } + + @Override + public int hashCode() { + return System.identityHashCode(o); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof ObjectInstance)) { + return false; + } + return ((ObjectInstance) obj).o == this.o; + } + } + + /** + * Set of commonly sized objects with precomputed static sizes. Some types have additional dynamic sizing that must still be computed. All precomputed + * static sizes include the object overhead and reference overhead + */ + class PrecomputedSizes { + /** + * static component: object ref (8) + ref to object (4) + int (4) + char[] (12 + 4) dynamic component: 2*length + */ + public static final long STRING_STATIC_REF = 32; + + /** + * All dates will be similarly sized, compute the size of one and reuse it + */ + public static final long DATE_STATIC_REF = Sizer.getObjectSize(new Date()) + Sizer.OBJECT_OVERHEAD + Sizer.REFERENCE; + + /** + * All IPV4 will be similarly sized object ref(8) + ref to object (4) + int (4) + int (4) + byte[4] (12 + 4 + 4*1) + */ + public static final long IPV4ADDRESS_STATIC_REF = 40; + + /** + * All IPV6 will be similarly sized object ref(8) + ref to object (4) + short[8] (12 + 4 + 8*2) + */ + public static final long IPV6ADDRESS_STATIC_REF = 44; + + /** + * All BigDecimal will be similarly sized object ref (8) + ref to object (4) + int (4) + int (4) + String (len=) + long (8) + BigInt( obj ref (8) + ref + * to object (4) + int (4) + int[] (len=) (12 + 4) + int (4) + int (4) + int (4) + int (4)) note: two dynamic string lengths should be negligible + */ + public static final long BIGDECIMAL_STATIC_REF = 76; + } + + class Sizer { + private static final Logger log = LoggerFactory.getLogger(Sizer.class); + public static final short 
OBJECT_OVERHEAD = 8; + public static final short ARRAY_OVERHEAD = 12; + public static final short REFERENCE = 4; + // The size of the basic Number constructs (and Boolean and Character) is 16: roundUp(8 + primitiveSize) + public static final short NUMBER_SIZE = 16; + + // Class name cache to avoid flood of NoSuchMethodException on reflective "sizeInBytes" invocation + private static Set noSuchMethodCache = ConcurrentHashMap.newKeySet(); + + /** + * Get the size of an object. Note that we want something relatively fast that gives us an order of magnitude here. The java Instrumentation agent + * mechanism is a little too costly for general use here. This will look for the ObjectSizeOf interface and if implemented on the object will use that. + * Otherwise it will do a simple navigation of the fields using reflection. + * + * @param o + * @return an approximation of the object size + */ + public static long getObjectSize(Object o) { + return getObjectSize(o, new HashSet(), new Stack(), true); + } + + public static long getObjectSize(Object o, Set visited, Stack stack, boolean useSizeInBytesMethod) { + long totalSize = 0; + stack.add(new ObjectInstance(o)); + while (!stack.isEmpty()) { + long size = 0; + ObjectInstance oi = stack.pop(); + o = oi.getObject(); + if (o != null && !visited.contains(oi)) { + visited.add(oi); + try { + if (useSizeInBytesMethod) { + try { + if (o instanceof ObjectSizeOf) { + size = ((ObjectSizeOf) o).sizeInBytes(); + } else { + if (!noSuchMethodCache.contains(o.getClass().getName())) { + Method sizeInBytes = o.getClass().getMethod("sizeInBytes", (Class[]) null); + size = (Long) sizeInBytes.invoke(o); + } + } + } catch (NoSuchMethodException e) { + noSuchMethodCache.add(o.getClass().getName()); + } catch (Throwable t) { + log.warn("Unexpected error invoking sizeInBytes on " + o.getClass().getName(), t); + } + } + if (size == 0) { + // the hard way... 
+ // do not include Class related objects or reflection objects + if ((o instanceof Class) || (o instanceof ClassLoader) + || (o.getClass().getPackage() != null && o.getClass().getPackage().getName().startsWith("java.lang.reflect"))) { + size = 0; + } else if (o instanceof Number || o instanceof Boolean || o instanceof Character) { + size = NUMBER_SIZE; + } else { + // lets do a simple sizing + Class c = o.getClass(); + if (c.isArray()) { + size = ARRAY_OVERHEAD; + int length = Array.getLength(o); + if (c.getComponentType().isPrimitive()) { + size += length * getPrimitiveObjectSize(c.getComponentType()); + } else { + size += length * REFERENCE; + for (int i = 0; i < length; i++) { + Object element = Array.get(o, i); + if (element != null) { + stack.add(new ObjectInstance(element)); + } + } + } + } else { + size += OBJECT_OVERHEAD; + while (c != null) { + for (Field field : c.getDeclaredFields()) { + if (Modifier.isStatic(field.getModifiers())) { + continue; + } + if (field.getType().isPrimitive()) { + size += getPrimitiveObjectSize(field.getType()); + } else { + size += REFERENCE; + boolean accessible = field.isAccessible(); + field.setAccessible(true); + try { + Object fieldObject = field.get(o); + if (fieldObject != null) { + stack.push(new ObjectInstance(fieldObject)); + } + } catch (Exception e) { + // cannot get to field, so ignore it in this size calculation + e.printStackTrace(); + } + field.setAccessible(accessible); + } + } + c = c.getSuperclass(); + } + } + size = roundUp(size); + } + } + } catch (Throwable t) { + log.error("Unable to determine object size for " + o); + } + } + totalSize += size; + } + + return totalSize; + } + + public static long roundUp(long size) { + long extra = size % 8; + if (extra > 0) { + size = size + 8 - extra; + } + return size; + } + + public static short getPrimitiveObjectSize(Class primitiveType) { + if (primitiveType.equals(int.class) || primitiveType.equals(float.class)) { + return 4; + } else if 
(primitiveType.equals(boolean.class) || primitiveType.equals(byte.class)) { + return 1; + } else if (primitiveType.equals(char.class) || primitiveType.equals(short.class)) { + return 2; + } else if (primitiveType.equals(long.class) || primitiveType.equals(double.class)) { + return 8; + } else { // if (primitiveType.equals(void.class)) { + return 0; + } + } + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/webservice/query/util/OptionallyEncodedString.java b/core/utils/type-utils/src/main/java/datawave/webservice/query/util/OptionallyEncodedString.java new file mode 100644 index 00000000000..6f023c6d39f --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/webservice/query/util/OptionallyEncodedString.java @@ -0,0 +1,119 @@ +package datawave.webservice.query.util; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; + +import javax.xml.bind.annotation.XmlAccessOrder; +import javax.xml.bind.annotation.XmlAccessType; +import javax.xml.bind.annotation.XmlAccessorOrder; +import javax.xml.bind.annotation.XmlAccessorType; +import javax.xml.bind.annotation.XmlAttribute; +import javax.xml.bind.annotation.XmlValue; + +import org.apache.commons.codec.binary.Base64; + +/** + * A JAXB holder class for strings that could possibly contain invalid XML characters. If any invalid XML characters are found, the string will be base64 + * encoded. + *

+ * Note: Consider not using this class directly, but rather keeping your underlying type as a string, and using the + * {@link OptionallyEncodedStringAdapter} instead. Here is an example: + * + *

+ *     @XmlRootElement
+ *     class DataClass {
+ *         @XmlElement(name="DataString")
+ *         @XmlJavaTypeAdapter(OptionallyEncodedStringAdapter.class)
+ *         private String dataString;
+ *         
+ *         ...
+ *     };
+ * 
+ * + * You can work with the {@code dataString} field normally, but it will marshal/unmarshall to the OptionallyEncodedString format. That is, if {@code dataString} + * contained an invalid XML character, then the marshalled XML would look like: + * + *
+ * {@code
+ * 
+ *     <DataClass>
+ *     <DataString base64Encoded="true">
+ *         BASE_64_STUFF_HERE
+ *     </DataString>
+ * }
+ * 
+ * + * @see OptionallyEncodedStringAdapter + */ +@XmlAccessorType(XmlAccessType.NONE) +@XmlAccessorOrder(XmlAccessOrder.ALPHABETICAL) +public class OptionallyEncodedString { + + @XmlAttribute + private Boolean base64Encoded = null; + + @XmlValue + private String value = null; + + public OptionallyEncodedString() {} + + public OptionallyEncodedString(String value) { + setValue(value); + } + + public Boolean getBase64Encoded() { + return base64Encoded; + } + + public String getValue() { + if (this.base64Encoded != null && this.base64Encoded.equals(Boolean.TRUE)) { + byte[] incoming; + String decoded = null; + + try { + incoming = value.getBytes("UTF-8"); + byte[] decodedBytes = Base64.decodeBase64(incoming); + decoded = new String(decodedBytes, Charset.forName("UTF-8")); + } catch (UnsupportedEncodingException e) { + // Should never happen with UTF-8!!! (but if it does we will be + // returning a null) + } + + return decoded; + } else { + return value; + } + } + + public byte[] getValueAsBytes() { + try { + byte[] incoming = value.getBytes("UTF-8"); + if (this.base64Encoded != null && this.base64Encoded.equals(Boolean.TRUE)) { + return Base64.decodeBase64(incoming); + } else { + return incoming; + } + } catch (UnsupportedEncodingException e) { + // Should never happen with UTF-8!!! 
(but if it does we will be + // returning a null) + } + + // Should never get here + return null; + } + + public void setBase64Encoded(Boolean base64Encoded) { + this.base64Encoded = base64Encoded; + } + + public void setValue(String value) { + if (XMLUtil.isValidXML(value)) { + this.value = value; + } else { + this.value = new String(Base64.encodeBase64(value.getBytes(UTF_8)), UTF_8); + this.base64Encoded = true; + } + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/webservice/query/util/OptionallyEncodedStringAdapter.java b/core/utils/type-utils/src/main/java/datawave/webservice/query/util/OptionallyEncodedStringAdapter.java new file mode 100644 index 00000000000..109cc2766ec --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/webservice/query/util/OptionallyEncodedStringAdapter.java @@ -0,0 +1,51 @@ +package datawave.webservice.query.util; + +import java.io.IOException; + +import javax.xml.bind.annotation.adapters.XmlAdapter; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.databind.DeserializationContext; +import com.fasterxml.jackson.databind.JsonDeserializer; +import com.fasterxml.jackson.databind.JsonSerializer; +import com.fasterxml.jackson.databind.SerializerProvider; + +/** + * An {@link XmlAdapter} that allows a {@link String} property to be bound to XML that is encoded as an {@link OptionallyEncodedStringAdapter}. + * + * @see OptionallyEncodedStringAdapter + */ +public class OptionallyEncodedStringAdapter extends XmlAdapter { + + @Override + public String unmarshal(OptionallyEncodedString v) throws Exception { + return v.getValue(); + } + + @Override + public OptionallyEncodedString marshal(String v) throws Exception { + return (v == null) ? 
null : new OptionallyEncodedString(v); + } + + public static class Serializer extends JsonSerializer { + @Override + public void serialize(Object obj, JsonGenerator jsonGenerator, SerializerProvider serializerProvider) throws IOException { + // depending on how we get here, we may have the underlying string of an OptionallyEncodedString. Check for both. + if (obj instanceof OptionallyEncodedString) { + jsonGenerator.writeObject((OptionallyEncodedString) obj); + } else { + jsonGenerator.writeObject(new OptionallyEncodedString(String.valueOf(obj))); + } + } + } + + public static class Deserializer extends JsonDeserializer { + + @Override + public String deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) throws IOException { + return jsonParser.readValueAs(OptionallyEncodedString.class).getValue(); + } + } + +} diff --git a/core/utils/type-utils/src/main/java/datawave/webservice/query/util/QueryUncaughtExceptionHandler.java b/core/utils/type-utils/src/main/java/datawave/webservice/query/util/QueryUncaughtExceptionHandler.java new file mode 100644 index 00000000000..8ea75c88d29 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/webservice/query/util/QueryUncaughtExceptionHandler.java @@ -0,0 +1,28 @@ +package datawave.webservice.query.util; + +import java.lang.Thread.UncaughtExceptionHandler; + +public class QueryUncaughtExceptionHandler implements UncaughtExceptionHandler { + + private Thread thread; + private Throwable throwable; + + @Override + public void uncaughtException(Thread t, Throwable e) { + // keep only the first one + if (this.throwable == null) { + synchronized (this) { + this.thread = t; + this.throwable = e; + } + } + } + + public Thread getThread() { + return thread; + } + + public Throwable getThrowable() { + return throwable; + } +} diff --git a/core/utils/type-utils/src/main/java/datawave/webservice/query/util/TypedValue.java 
b/core/utils/type-utils/src/main/java/datawave/webservice/query/util/TypedValue.java new file mode 100644 index 00000000000..3bdb1b02a49 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/webservice/query/util/TypedValue.java @@ -0,0 +1,520 @@ +package datawave.webservice.query.util; + +import java.io.IOException; +import java.io.Serializable; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.Charset; +import java.util.Calendar; +import java.util.Date; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; + +import javax.xml.bind.DatatypeConverter; +import javax.xml.bind.Unmarshaller; +import javax.xml.bind.annotation.XmlAccessOrder; +import javax.xml.bind.annotation.XmlAccessType; +import javax.xml.bind.annotation.XmlAccessorOrder; +import javax.xml.bind.annotation.XmlAccessorType; +import javax.xml.bind.annotation.XmlAttribute; +import javax.xml.bind.annotation.XmlRootElement; +import javax.xml.bind.annotation.XmlTransient; +import javax.xml.bind.annotation.XmlValue; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonProperty; + +import datawave.data.normalizer.DateNormalizer; +import datawave.data.type.DateType; +import datawave.data.type.IpAddressType; +import datawave.data.type.NoOpType; +import datawave.data.type.NumberType; +import datawave.data.type.Type; +import datawave.webservice.query.data.ObjectSizeOf; +import io.protostuff.Input; +import io.protostuff.Message; +import io.protostuff.Output; +import io.protostuff.Schema; + +@XmlRootElement +@XmlAccessorType(XmlAccessType.NONE) +@XmlAccessorOrder(XmlAccessOrder.ALPHABETICAL) +public class TypedValue implements Serializable, Message<TypedValue> { + private static final long serialVersionUID = 1987198355354220378L; + + public static final String XSD_BOOLEAN = "xs:boolean"; + public static final String XSD_BYTE = "xs:byte"; + public static final String XSD_DATETIME = "xs:dateTime"; + public 
static final String XSD_DECIMAL = "xs:decimal"; + public static final String XSD_DOUBLE = "xs:double"; + public static final String XSD_FLOAT = "xs:float"; + public static final String XSD_HEXBINARY = "xs:hexBinary"; + public static final String XSD_BASE64BINARY = "xs:base64Binary"; + public static final String XSD_INT = "xs:int"; + public static final String XSD_INTEGER = "xs:integer"; + public static final String XSD_LONG = "xs:long"; + public static final String XSD_SHORT = "xs:short"; + public static final String XSD_STRING = "xs:string"; + public static final String XSD_IPADDRESS = "xs:ipAddress"; + public static final String MAX_UNICODE_STRING = new String(Character.toChars(Character.MAX_CODE_POINT)); + + @XmlAttribute(required = false) + private Boolean base64Encoded; + + @XmlAttribute + private String type; + + // NOTE: Primitive type info is sometimes lost for Object value using ObjectMapper (de)serialization (i.e. Long becomes Integer, Float becomes + // Double) so only use the marshalledValue + @JsonIgnore + @XmlTransient + private Object value; + + @XmlTransient + private Class dataType; + + @JsonProperty + @XmlValue + private String marshalledValue; + + public TypedValue() {} + + public TypedValue(Object value) { + setDataType(value.getClass()); + setValue(value); + } + + public long sizeInBytes() { + // return the approximate overhead of this class + long size = 28; + // 8 for the object overhead + // 20 for the object references + // all rounded up to the nearest multiple of 8 + size += (base64Encoded == null ? 
0 : 16) + sizeInBytes(type) + ObjectSizeOf.Sizer.getObjectSize(value) + sizeInBytes(marshalledValue); + // note we are ignoring Class object + return size; + } + + // a helper method to return the size of a string + protected static long sizeInBytes(String value) { + if (value == null) { + return 0; + } else { + return 24 + roundUp(12 + (value.length() * 2)); + // 24 for 3 ints, array ref, and object overhead + // 12 for array overhead + } + } + + protected static long roundUp(long size) { + long extra = size % 8; + if (extra > 0) { + size = size + 8 - extra; + } + return size; + } + + public Object getValue() { + // NOTE: Proper (de)serialization via ObjectMapper relies on this check to populate value via marshalledValue + if (null != this.marshalledValue && null == this.value) { + afterUnmarshal((Unmarshaller) null, null); + } + return value; + } + + public void setValue(Object value) { + this.value = value; + + Class clazz = value.getClass(); + if (String.class.equals(clazz)) { + String string = (String) value; + // this can happen when a HIT_TERM was created from a Composite field. 
Remove the composite separator + if (string.contains(MAX_UNICODE_STRING)) { + string = string.replaceAll(MAX_UNICODE_STRING, " "); + } + setMarshalledStringValue(string); + } else if (byte[].class.equals(clazz)) { + this.marshalledValue = DatatypeConverter.printBase64Binary((byte[]) value); + this.type = XSD_BASE64BINARY; + } else if (Boolean.class.equals(clazz)) { + this.marshalledValue = DatatypeConverter.printBoolean((Boolean) value); + this.type = XSD_BOOLEAN; + } else if (Byte.class.equals(clazz)) { + this.marshalledValue = DatatypeConverter.printByte((Byte) value); + this.type = XSD_BYTE; + } else if (Date.class.isAssignableFrom(clazz)) { + Date d = (Date) value; + DateNormalizer dn = new DateNormalizer(); + this.marshalledValue = DatatypeConverter.printString(dn.parseToString(d)); + this.type = XSD_DATETIME; + } else if (Calendar.class.isAssignableFrom(clazz)) { + this.marshalledValue = DatatypeConverter.printDateTime((Calendar) value); + this.type = XSD_DATETIME; + } else if (BigDecimal.class.equals(clazz)) { + this.marshalledValue = DatatypeConverter.printDecimal((BigDecimal) value); + this.type = XSD_DECIMAL; + } else if (Number.class.equals(clazz)) { + this.marshalledValue = DatatypeConverter.printString(value.toString()); + this.type = XSD_DECIMAL; + } else if (Double.class.equals(clazz)) { + this.marshalledValue = DatatypeConverter.printDouble((Double) value); + this.type = XSD_DOUBLE; + } else if (Float.class.equals(clazz)) { + this.marshalledValue = DatatypeConverter.printFloat((Float) value); + this.type = XSD_FLOAT; + } else if (Integer.class.equals(clazz)) { + this.marshalledValue = DatatypeConverter.printInt((Integer) value); + this.type = XSD_INT; + } else if (BigInteger.class.equals(clazz)) { + this.marshalledValue = DatatypeConverter.printInteger((BigInteger) value); + this.type = XSD_INTEGER; + } else if (Long.class.equals(clazz)) { + this.marshalledValue = DatatypeConverter.printLong((Long) value); + this.type = XSD_LONG; + } else if 
(Short.class.equals(clazz)) { + this.marshalledValue = DatatypeConverter.printShort((Short) value); + this.type = XSD_SHORT; + } else if (IpAddressType.class.equals(clazz)) { + Type type = (Type) value; + String valueToDisplay = type.getDelegate().toString(); + this.marshalledValue = DatatypeConverter.printString(valueToDisplay); + this.type = XSD_IPADDRESS; + } else if (DateType.class.equals(clazz)) { + Type type = (Type) value; + Date d = (Date) type.getDelegate(); + DateNormalizer dn = new DateNormalizer(); + this.marshalledValue = DatatypeConverter.printString(dn.parseToString(d)); + this.type = XSD_DATETIME; + } else if (NumberType.class.equals(clazz)) { + NumberType dn = (NumberType) value; + BigDecimal bd = dn.getDelegate(); + this.marshalledValue = DatatypeConverter.printDecimal(bd); + this.type = XSD_DECIMAL; + } else if (clazz.toString().contains("IpV4Address")) { + this.marshalledValue = DatatypeConverter.printString(value.toString()); + this.type = XSD_IPADDRESS; + } else if (clazz.toString().contains("IpV6Address")) { + this.marshalledValue = DatatypeConverter.printString(value.toString()); + this.type = XSD_IPADDRESS; + } else if (NoOpType.class.equals(clazz)) { + Type type = (Type) value; + String valueToDisplay = type.getDelegate().toString(); + setMarshalledStringValue(valueToDisplay); + } else if (Type.class.isAssignableFrom(clazz)) { + Type type = (Type) value; + String valueToDisplay = type.getDelegate().toString(); + valueToDisplay = valueToDisplay.replaceAll(MAX_UNICODE_STRING, ""); + setMarshalledStringValue(valueToDisplay); + } else if (LinkedHashMap.class.equals(clazz)) { + // this is a special case when Json serialization is used. The Type objects end up being a map. 
+ setMarshalledStringValue(String.valueOf(((LinkedHashMap) value).get("delegate"))); + } else { + throw new IllegalArgumentException("Unhandled class type: " + clazz.getName()); + } + } + + private void setMarshalledStringValue(String string) { + if (XMLUtil.isValidXML(string)) { + this.marshalledValue = DatatypeConverter.printString(string); + } else { + this.marshalledValue = DatatypeConverter.printBase64Binary(string.getBytes(Charset.forName("UTF-8"))); + base64Encoded = Boolean.TRUE; + } + this.type = XSD_STRING; + } + + public String getType() { + return type; + } + + public void setDataType(Class dataType) { + this.dataType = dataType; + } + + public Class getDataType() { + return this.dataType; + } + + public boolean isBase64Encoded() { + return base64Encoded != null && base64Encoded; + } + + @Override + public String toString() { + StringBuilder buf = new StringBuilder(); + buf.append("TypedValue [base64Encoded=").append(isBase64Encoded()); + buf.append(" type=").append(type); + buf.append(" marshalledValue=").append(marshalledValue); + buf.append(" value= "); + if (null != value) + buf.append(value).append("] "); + else + buf.append("null ]"); + return buf.toString(); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((base64Encoded == null) ? 0 : base64Encoded.hashCode()); + result = prime * result + ((marshalledValue == null) ? 0 : marshalledValue.hashCode()); + result = prime * result + ((type == null) ? 0 : type.hashCode()); + result = prime * result + ((value == null) ? 
0 : value.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + TypedValue other = (TypedValue) obj; + if (base64Encoded == null) { + if (other.base64Encoded != null) + return false; + } else if (!base64Encoded.equals(other.base64Encoded)) + return false; + if (marshalledValue == null) { + if (other.marshalledValue != null) + return false; + } else if (!marshalledValue.equals(other.marshalledValue)) + return false; + if (type == null) { + if (other.type != null) + return false; + } else if (!type.equals(other.type)) + return false; + if (value == null) { + if (other.value != null) + return false; + } else if (other.value == null) { + return false; + } else if (value instanceof Type && other.value instanceof Type) { + Type thisValue = (Type) this.value; + Type thatValue = (Type) other.value; + return thisValue.getDelegate().equals(thatValue.getDelegate()); + } else if (!value.equals(other.value)) + return false; + return true; + } + + // Method is called by the JAXB marshalling code (and by getValue()); rebuilds value from marshalledValue according to type + private void afterUnmarshal(Unmarshaller unmarshaller, Object parent) { + if (XSD_STRING.equals(type)) { + if (isBase64Encoded()) { + value = new String(DatatypeConverter.parseBase64Binary(marshalledValue), Charset.forName("UTF-8")); + } else { + value = DatatypeConverter.parseString(marshalledValue); + } + } else if (XSD_HEXBINARY.equals(type)) { + value = DatatypeConverter.parseHexBinary(marshalledValue); + } else if (XSD_BASE64BINARY.equals(type)) { + value = DatatypeConverter.parseBase64Binary(marshalledValue); + } else if (XSD_BOOLEAN.equals(type)) { + value = DatatypeConverter.parseBoolean(marshalledValue); + } else if (XSD_BYTE.equals(type)) { + value = DatatypeConverter.parseByte(marshalledValue); + } else if (XSD_DATETIME.equals(type)) { + value = new 
DateType(marshalledValue).getDelegate(); + } else if 
(XSD_DECIMAL.equals(type)) { + value = DatatypeConverter.parseDecimal(marshalledValue); + } else if (XSD_DOUBLE.equals(type)) { + value = DatatypeConverter.parseDouble(marshalledValue); + } else if (XSD_FLOAT.equals(type)) { + value = DatatypeConverter.parseFloat(marshalledValue); + } else if (XSD_INT.equals(type)) { + value = DatatypeConverter.parseInt(marshalledValue); + } else if (XSD_INTEGER.equals(type)) { + value = DatatypeConverter.parseInteger(marshalledValue); + } else if (XSD_LONG.equals(type)) { + value = DatatypeConverter.parseLong(marshalledValue); + } else if (XSD_SHORT.equals(type)) { + value = DatatypeConverter.parseShort(marshalledValue); + } else if (XSD_IPADDRESS.equals(type)) { + value = new IpAddressType(marshalledValue).getDelegate(); + } + } + + public static Schema<TypedValue> getSchema() { + return SCHEMA; + } + + @Override + public Schema<TypedValue> cachedSchema() { + return SCHEMA; + } + + @XmlTransient + private static final Schema<TypedValue> SCHEMA = new Schema<TypedValue>() { + + @Override + public TypedValue newMessage() { + return new TypedValue(); + } + + @Override + public Class<? super TypedValue> typeClass() { + return TypedValue.class; + } + + @Override + public String messageName() { + return TypedValue.class.getSimpleName(); + } + + @Override + public String messageFullName() { + return TypedValue.class.getName(); + } + + @Override + public boolean isInitialized(TypedValue message) { + return true; + } + + @Override + public void writeTo(Output output, TypedValue message) throws IOException { + Class clazz = message.value.getClass(); + if (Type.class.isAssignableFrom(clazz)) { + message.value = ((Type) message.value).getDelegate(); + clazz = message.value.getClass(); + } + if (String.class.equals(clazz)) { + output.writeString(1, (String) message.value, false); + } else if (Byte.class.equals(clazz)) { + output.writeInt32(2, (Byte) message.value, false); + } else if (byte[].class.equals(clazz)) { + output.writeByteArray(3, 
(byte[]) message.value, false); + } else if (Date.class.isAssignableFrom(clazz)) 
{ + output.writeInt64(4, ((Date) message.value).getTime(), false); + } else if (Calendar.class.isAssignableFrom(clazz)) { + output.writeInt64(4, ((Calendar) message.value).getTimeInMillis(), false); + } else if (BigDecimal.class.equals(clazz)) { + output.writeString(5, ((BigDecimal) message.value).toString(), false); + } else if (Short.class.equals(clazz)) { + output.writeInt32(6, (Short) message.value, false); + } else if (Integer.class.equals(clazz)) { + output.writeInt32(7, (Integer) message.value, false); + } else if (BigInteger.class.equals(clazz)) { + output.writeString(8, ((BigInteger) message.value).toString(), false); + } else if (Long.class.equals(clazz)) { + output.writeInt64(9, (Long) message.value, false); + } else if (Float.class.equals(clazz)) { + output.writeFloat(10, (Float) message.value, false); + } else if (Double.class.equals(clazz)) { + output.writeDouble(11, (Double) message.value, false); + } else if (Boolean.class.equals(clazz)) { + output.writeBool(12, (Boolean) message.value, false); + } + } + + @Override + public void mergeFrom(Input input, TypedValue message) throws IOException { + int number; + while ((number = input.readFieldNumber(this)) != 0) { + switch (number) { + case 1: + message.setValue(input.readString()); + break; + case 2: + message.setValue((byte) input.readInt32()); + break; + case 3: + message.setValue(input.readByteArray()); + break; + case 4: + Calendar cal = Calendar.getInstance(); + cal.setTimeInMillis(input.readInt64()); + message.setValue(cal); + break; + case 5: + message.setValue(new BigDecimal(input.readString())); + break; + case 6: + message.setValue((short) input.readInt32()); + break; + case 7: + message.setValue(input.readInt32()); + break; + case 8: + message.setValue(new BigInteger(input.readString())); + break; + case 9: + message.setValue(input.readInt64()); + break; + case 10: + message.setValue(input.readFloat()); + break; + case 11: + message.setValue(input.readDouble()); + break; + case 12: + 
message.setValue(input.readBool()); + break; + default: + input.handleUnknownField(number, this); + break; + } + } + } + + @Override + public String getFieldName(int number) { + switch (number) { + case 1: + return "stringValue"; + case 2: + return "byteValue"; + case 3: + return "byteArrayValue"; + case 4: + return "dateTimeValue"; + case 5: + return "decimalValue"; + case 6: + return "shortValue"; + case 7: + return "intValue"; + case 8: + return "integerValue"; + case 9: + return "longValue"; + case 10: + return "floatValue"; + case 11: + return "doubleValue"; + case 12: + return "booleanValue"; + default: + return null; + } + } + + @Override + public int getFieldNumber(String name) { + final Integer number = fieldMap.get(name); + return number == null ? 0 : number.intValue(); + } + + private final HashMap<String,Integer> fieldMap = new HashMap<>(); + { + fieldMap.put("stringValue", 1); + fieldMap.put("byteValue", 2); + fieldMap.put("byteArrayValue", 3); + fieldMap.put("dateTimeValue", 4); + fieldMap.put("decimalValue", 5); + fieldMap.put("shortValue", 6); + fieldMap.put("intValue", 7); + fieldMap.put("integerValue", 8); + fieldMap.put("longValue", 9); + fieldMap.put("floatValue", 10); + fieldMap.put("doubleValue", 11); + fieldMap.put("booleanValue", 12); + } + }; +} diff --git a/core/utils/type-utils/src/main/java/datawave/webservice/query/util/TypedValueAdapter.java b/core/utils/type-utils/src/main/java/datawave/webservice/query/util/TypedValueAdapter.java new file mode 100644 index 00000000000..5fc9e1b8288 --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/webservice/query/util/TypedValueAdapter.java @@ -0,0 +1,16 @@ +package datawave.webservice.query.util; + +import javax.xml.bind.annotation.adapters.XmlAdapter; + +public class TypedValueAdapter extends XmlAdapter<TypedValue,Object> { + + @Override + public Object unmarshal(TypedValue v) throws Exception { + return v.getValue(); + } + + @Override + public TypedValue marshal(Object v) throws Exception { + 
return new TypedValue(v); 
+ } +} diff --git a/core/utils/type-utils/src/main/java/datawave/webservice/query/util/XMLUtil.java b/core/utils/type-utils/src/main/java/datawave/webservice/query/util/XMLUtil.java new file mode 100644 index 00000000000..b9fc31f3f1a --- /dev/null +++ b/core/utils/type-utils/src/main/java/datawave/webservice/query/util/XMLUtil.java @@ -0,0 +1,27 @@ +package datawave.webservice.query.util; + +public class XMLUtil { + private XMLUtil() { + // static helpers only; prevent construction + } + + public static boolean isValidXML(String s) { + return s.codePoints().allMatch(XMLUtil::isValidXMLChar); + } + + // XML 1.0 spec says the following are valid XML characters (this is the set accepted by isValidXMLChar): + // #x9 | #xA | #xD | #x20-#xD7FF | #xE000-#xFFFD | #x10000-#x10FFFF + // + // XML 1.1 spec says the following are valid XML characters (listed for reference; the extra low control characters are rejected below): + // #x1-#xD7FF | #xE000-#xFFFD | #x10000-#x10FFFF + public static boolean isValidXMLChar(int c) { + // @formatter:off + return c == 0x9 + || c == 0xA + || c == 0xD + || (c >= 0x20 && c <= 0xD7FF) + || (c >= 0xE000 && c <= 0xFFFD) + || (c >= 0x10000 && c <= 0x10FFFF); + // @formatter:on + } +} diff --git a/core/utils/type-utils/src/main/resources/META-INF/services/datawave.data.parser.GeometryParser b/core/utils/type-utils/src/main/resources/META-INF/services/datawave.data.parser.GeometryParser new file mode 100644 index 00000000000..7fca117c49e --- /dev/null +++ b/core/utils/type-utils/src/main/resources/META-INF/services/datawave.data.parser.GeometryParser @@ -0,0 +1 @@ +datawave.data.parser.WKTParser diff --git a/core/utils/type-utils/src/main/resources/source-templates/datawave/webservice/query/util/package-info.java b/core/utils/type-utils/src/main/resources/source-templates/datawave/webservice/query/util/package-info.java new file mode 100644 index 00000000000..1ac08835f87 --- /dev/null +++ 
b/core/utils/type-utils/src/main/resources/source-templates/datawave/webservice/query/util/package-info.java @@ -0,0 +1,7 @@ +@XmlSchema(namespace="${datawave.webservice.namespace}", 
elementFormDefault=XmlNsForm.QUALIFIED, xmlns={@XmlNs(prefix = "", namespaceURI = "${datawave.webservice.namespace}")}) +package datawave.webservice.query.util; + +import javax.xml.bind.annotation.XmlNs; +import javax.xml.bind.annotation.XmlNsForm; +import javax.xml.bind.annotation.XmlSchema; + diff --git a/core/utils/type-utils/src/main/spotbugs/excludes.xml b/core/utils/type-utils/src/main/spotbugs/excludes.xml new file mode 100644 index 00000000000..d8229444da6 --- /dev/null +++ b/core/utils/type-utils/src/main/spotbugs/excludes.xml @@ -0,0 +1,12 @@ + + + + + + + + + \ No newline at end of file diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/DateNormalizerTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/DateNormalizerTest.java new file mode 100644 index 00000000000..f2b6d1a7063 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/DateNormalizerTest.java @@ -0,0 +1,183 @@ +package datawave.data.normalizer; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; + +import org.apache.log4j.Logger; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import com.google.common.collect.Sets; + +/** + * + * + */ +public class DateNormalizerTest { + + private static final Logger log = Logger.getLogger(DateNormalizerTest.class); + DateNormalizer normalizer = new DateNormalizer(); + + String[] inputDateStrings = {"2014-10-20T00:00:00.0000000", "2014-10-20T00:00:00.000Z", "20141020000000", "2014-10-20 00:00:00GMT", "2014-10-20 00:00:00Z", + "2014-10-20 00:00:00", "2014-10-20", "2014-10-20T00|00", "Mon Oct 20 00:00:00 GMT 2014", 
"2014-10-20T00:00:00Z", "2014-10-20t00:00:00z", + "2014-10-20T00:00:00+00:00", "Mon Oct 20 00:00:00 +00:00 2014"}; + + @BeforeAll + public static void setupClass() { + System.setProperty("user.timezone", "GMT"); + } + + @Test + public void testAllFormats() { + assertEquals(inputDateStrings.length, DateNormalizer.FORMAT_STRINGS.length, "The DateNormalizer may have an new untested format"); + Set dateSet = Sets.newLinkedHashSet(); + Set normalizedDates = Sets.newLinkedHashSet(); + Set dateTimes = Sets.newLinkedHashSet(); + for (String inputDateString : inputDateStrings) { + Date date = normalizer.denormalize(inputDateString); + dateSet.add(date); + String normalized = normalizer.normalizeDelegateType(date); + normalizedDates.add(normalized); + dateTimes.add(date.getTime()); + } + assertEquals(1, dateSet.size(), "There can be only one dateSet utilized for this unit test."); + assertEquals(1, normalizedDates.size(), "There should be only one normalizedDate produced from this test."); + assertEquals(1, dateTimes.size(), "There should be only one dateTimes produced from this test."); + } + + @Test + public void testExpectedResults() { + String input = "2014-10-20T17:20:20.001Z"; + String normalized = normalizer.normalize(input); + assertEquals("2014-10-20T17:20:20.001Z", normalized); + + input = "20141020172020"; + normalized = normalizer.normalize(input); + assertEquals("2014-10-20T17:20:20.000Z", normalized); + + input = "2014-10-20 17:20:20GMT"; + normalized = normalizer.normalize(input); + assertEquals("2014-10-20T17:20:20.000Z", normalized); + + input = "2014-10-20 17:20:20Z"; + normalized = normalizer.normalize(input); + assertEquals("2014-10-20T17:20:20.000Z", normalized); + + input = "2014-10-20"; + normalized = normalizer.normalize(input); + assertEquals("2014-10-20T00:00:00.000Z", normalized); + + input = "2014-10-20 17:20:20"; + normalized = normalizer.normalize(input); + assertEquals("2014-10-20T17:20:20.000Z", normalized); + + input = "2014-10-20T17|20"; + 
normalized = normalizer.normalize(input); + assertEquals("2014-10-20T17:20:00.000Z", normalized); + + input = "Mon Oct 20 17:20:20 GMT 2014"; + normalized = normalizer.normalize(input); + assertEquals("2014-10-20T17:20:20.000Z", normalized); + + input = "2014-10-20T17:20:20Z"; + normalized = normalizer.normalize(input); + assertEquals("2014-10-20T17:20:20.000Z", normalized); + + input = "2014-10-20t17:20:20z"; + normalized = normalizer.normalize(input); + assertEquals("2014-10-20T17:20:20.000Z", normalized); + + input = "Thu Jan 1 00:00:00 GMT 1970"; + normalized = normalizer.normalize(input); + assertEquals("1970-01-01T00:00:00.000Z", normalized); + + input = "2014-10-20T17:20:20.345007Z"; + normalized = normalizer.normalize(input); + assertEquals("2014-10-20T17:20:20.345Z", normalized); + + input = "2014-10-20T00:00:00.0000000"; + normalized = normalizer.normalize(input); + assertEquals("2014-10-20T00:00:00.000Z", normalized); + + input = "2014-10-20T00:00:00.1111"; + normalized = normalizer.normalize(input); + assertEquals("2014-10-20T00:00:00.111Z", normalized); + + input = "2014-10-20T00:00:00.222"; + normalized = normalizer.normalize(input); + assertEquals("2014-10-20T00:00:00.222Z", normalized); + } + + @Test + public void testFromLong() { + Date now = new Date(); + long rightNow = now.getTime(); + String normalizedFromLong = normalizer.normalize("" + rightNow); + String normalizedFromDate = normalizer.normalizeDelegateType(now); + assertEquals(normalizedFromLong, normalizedFromDate); + } + + /** + * Show that an un-protected SimpleDateFormat will cause this test to have more than 4 Dates, or cause it to throw an Exception: + */ + @Test + public void showThreadUnsafeDateFormat() { + + try { + DateFormat sdf = new SimpleDateFormat("yyyyMMdd"); + final Date[] thedates = new Date[] {sdf.parse("20170101"), sdf.parse("20170201"), sdf.parse("20170102"), sdf.parse("20160101"), + + }; + final DateFormat unsafeDateFormat = new SimpleDateFormat("yyyyMMdd"); + Callable 
task = () -> unsafeDateFormat.format(thedates[(int) (Math.random() * 4)]); + + ExecutorService exec = Executors.newFixedThreadPool(2); + List> results = new ArrayList<>(); + for (int i = 0; i < 200; i++) { + results.add(exec.submit(task)); + } + exec.shutdown(); + Set dates = Sets.newHashSet(); + for (Future result : results) { + dates.add(result.get()); + } + log.info("unsafe threading on DateFormat got back this many dates instead of 4:" + dates.size()); + } catch (Exception ex) { + log.info("sometimes, the DateFormat will throw an exception when used in multiple threads:" + ex); + } + } + + /** + * this test uses the ThreadLocal in DateNormalizer to give correct results with multi-threading + */ + @Test + public void testThreadSafeConversions() throws Exception { + DateFormat sdf = new SimpleDateFormat("yyyyMMdd"); + final Date[] thedates = new Date[] {sdf.parse("20170101"), sdf.parse("20170201"), sdf.parse("20170102"), sdf.parse("20160101"), + + }; + Callable task = () -> normalizer.parseToString(thedates[(int) (Math.random() * 4)]); + + ExecutorService exec = Executors.newFixedThreadPool(2); + List> results = new ArrayList<>(); + for (int i = 0; i < 200; i++) { + results.add(exec.submit(task)); + } + exec.shutdown(); + Set dates = Sets.newHashSet(); + for (Future result : results) { + dates.add(result.get()); + } + assertEquals(4, dates.size()); + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/GeometryNormalizerTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/GeometryNormalizerTest.java new file mode 100644 index 00000000000..0b6fdd98e84 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/GeometryNormalizerTest.java @@ -0,0 +1,131 @@ +package datawave.data.normalizer; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + 
+import org.apache.commons.codec.binary.Hex; +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.locationtech.geowave.core.geotime.util.GeometryUtils; +import org.locationtech.geowave.core.index.ByteArrayRange; +import org.locationtech.geowave.core.index.sfc.data.MultiDimensionalNumericData; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.geom.GeometryFactory; +import org.locationtech.jts.io.WKTWriter; + +import com.google.common.collect.Lists; + +public class GeometryNormalizerTest { + + private GeometryNormalizer normalizer = null; + + @BeforeEach + public void setup() { + normalizer = new GeometryNormalizer(); + } + + @Test + public void testPoint() { + Geometry point = new GeometryFactory().createPoint(new Coordinate(10, 10)); + List insertionIds = new ArrayList<>(normalizer.expand(new WKTWriter().write(point))); + assertEquals(1, insertionIds.size()); + assertEquals("1f200a80a80a80a80a", insertionIds.get(0)); + } + + @Test + public void testLine() { + Geometry line = new GeometryFactory().createLineString(new Coordinate[] {new Coordinate(-10, -10), new Coordinate(0, 0), new Coordinate(10, 20)}); + List insertionIds = new ArrayList<>(normalizer.expand(new WKTWriter().write(line))); + Collections.sort(insertionIds); + assertEquals(4, insertionIds.size()); + assertEquals("042a", insertionIds.get(0)); + assertEquals("047f", insertionIds.get(1)); + assertEquals("0480", insertionIds.get(2)); + assertEquals("04d5", insertionIds.get(3)); + } + + @Test + public void testPolygon() { + Geometry polygon = new GeometryFactory().createPolygon(new Coordinate[] {new Coordinate(-10, -10), new Coordinate(10, -10), new Coordinate(10, 10), + new Coordinate(-10, 10), new Coordinate(-10, -10)}); + List insertionIds = new ArrayList<>(normalizer.expand(new WKTWriter().write(polygon))); + assertEquals(4, insertionIds.size()); + 
assertEquals("0500aa", insertionIds.get(0)); + assertEquals("0501ff", insertionIds.get(1)); + assertEquals("050200", insertionIds.get(2)); + assertEquals("050355", insertionIds.get(3)); + } + + @Test + public void testWKTPoint() { + Geometry geom = AbstractGeometryNormalizer.parseGeometry("POINT(10 20)"); + assertEquals(10.0, geom.getGeometryN(0).getCoordinate().x, 0.0); + assertEquals(20.0, geom.getGeometryN(0).getCoordinate().y, 0.0); + + List insertionIds = new ArrayList<>(normalizer.expand(new WKTWriter().write(geom))); + assertEquals(1, insertionIds.size()); + assertEquals("1f20306ba4306ba430", insertionIds.get(0)); + } + + @Test + public void testWKTPointz() { + Geometry geom = AbstractGeometryNormalizer.parseGeometry("POINT Z(10 20 30)"); + assertEquals(10.0, geom.getGeometryN(0).getCoordinate().x, 0.0); + assertEquals(20.0, geom.getGeometryN(0).getCoordinate().y, 0.0); + assertEquals(30.0, geom.getGeometryN(0).getCoordinate().z, 0.0); + + List insertionIds = new ArrayList<>(normalizer.expand(new WKTWriter().write(geom))); + assertEquals(1, insertionIds.size()); + assertEquals("1f20306ba4306ba430", insertionIds.get(0)); + } + + @Test + public void testQueryRanges() throws Exception { + Geometry polygon = new GeometryFactory().createPolygon(new Coordinate[] {new Coordinate(-10, -10), new Coordinate(10, -10), new Coordinate(10, 10), + new Coordinate(-10, 10), new Coordinate(-10, -10)}); + + List allRanges = new ArrayList<>(); + for (MultiDimensionalNumericData range : GeometryUtils.basicConstraintsFromEnvelope(polygon.getEnvelopeInternal()) + .getIndexConstraints(GeometryNormalizer.getGeometryIndex())) { + allRanges.addAll(Lists.reverse(GeometryNormalizer.getGeometryIndexStrategy().getQueryRanges(range).getCompositeQueryRanges())); + } + + assertEquals(3746, allRanges.size()); + + StringBuffer result = new StringBuffer(); + for (ByteArrayRange range : allRanges) { + result.append(Hex.encodeHexString(range.getStart())); + 
result.append(Hex.encodeHexString(range.getEnd())); + } + + String expected = IOUtils.toString(this.getClass().getClassLoader().getResourceAsStream("datawave/data/normalizer/geoRanges.txt"), "UTF8"); + + assertEquals(expected, result.toString()); + } + + @Test + public void testHash() { + String[] validHashes = new String[] {"00", "0100", "020d", "031b", "04df", "05031e", "0604ff", "0713ff", "08c7fe", "09023fff", "0a04ffff", "0b0dffff", + "0c8fffff", "0d01c00000", "0e0b000000", "0f0dfffffe", "1037ffffff", "11023fffffff", "1208ffffffff", "131c00000000", "1437ffffffff", + "15023fffffffff", "16070000000000", "1723ffffffffff", "188fffffffffff", "19013fffffffffff", "1a08ffffffffffff", "1b1c000000000000", + "1c4fffffffffffff", "1d01c0000000000000", "1e0700000000000000", "1f0dffffffffffffff"}; + String[] invalidHashes = new String[] {"0", "0001", "01", "1fffffffffffffffff", "200dffffffffffffff", "1c4fffffffffffffff"}; + for (String hash : validHashes) { + assertEquals(hash, normalizer.normalize(hash)); + } + for (String hash : invalidHashes) { + try { + normalizer.normalize(hash); + fail("Should have failed to normalize " + hash); + } catch (Exception e) { + // this is expected + } + } + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/HexStringNormalizerTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/HexStringNormalizerTest.java new file mode 100644 index 00000000000..155ffa29e6e --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/HexStringNormalizerTest.java @@ -0,0 +1,60 @@ +package datawave.data.normalizer; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import org.junit.jupiter.api.Test; + +public class HexStringNormalizerTest { + + private final HexStringNormalizer normalizer = new HexStringNormalizer(); + + @Test + public void testAllHexCharacters() { + assertEquals("1234567890abcdefabcdef", 
normalizer.normalize("1234567890abcdefABCDEF"), "Test all hex characters"); + assertEquals("1234567890abcdefabcdef", normalizer.normalize("0x1234567890abcdefABCDEF"), "Test all hex characters w/0x"); + } + + @Test + public void testOddLenghtValidHexString() { + assertEquals("0123", normalizer.normalize("123"), "Test odd length"); + assertEquals("0123", normalizer.normalize("0x123"), "Test odd length w/0x"); + assertEquals("0abcde", normalizer.normalize("abCde"), "Test odd length"); + assertEquals("0abcde", normalizer.normalize("0xabCde"), "Test odd length w/0x"); + } + + @Test + public void testInvalidHexStringEmpty() { + assertThrows(IllegalArgumentException.class, () -> normalizer.normalize("")); + } + + @Test + public void testInvalidHexStringPrefixOnly() { + assertThrows(IllegalArgumentException.class, () -> normalizer.normalize("0x"), "Test invalid hex string w/0x"); + } + + @Test + public void testInvalidHexStringNotHex() { + assertThrows(IllegalArgumentException.class, () -> normalizer.normalize("Not Hex"), "Test invalid hex string"); + } + + @Test + public void testInvalidHexStringWithG() { + assertThrows(IllegalArgumentException.class, () -> normalizer.normalize("aBcDeFg12345"), "Test invalid hex string"); + } + + @Test + public void testConvertFieldRegexEmpty() { + assertThrows(IllegalArgumentException.class, () -> normalizer.normalizeRegex("")); + } + + @Test + public void testConvertFieldRegexToLower() { + assertEquals("1234567890abcdefabcdef", normalizer.normalizeRegex("1234567890abcdefABCDEF"), "Test convertFieldRegex"); + } + + @Test + public void testConvertFieldRegexNull() { + assertThrows(IllegalArgumentException.class, () -> normalizer.normalizeRegex(null)); + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/IpAddressNormalizerTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/IpAddressNormalizerTest.java new file mode 100644 index 00000000000..e6eef6cb68e --- /dev/null +++ 
b/core/utils/type-utils/src/test/java/datawave/data/normalizer/IpAddressNormalizerTest.java @@ -0,0 +1,124 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ + +package datawave.data.normalizer; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import org.apache.log4j.Logger; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import datawave.data.type.util.IpV4Address; + +/** + * + */ +public class IpAddressNormalizerTest { + private static Logger log = Logger.getLogger(IpAddressNormalizerTest.class); + + @Test + public void testIpNormalizer01() { + String ip = "1.2.3.4"; + String expected = "001.002.003.004"; + IpAddressNormalizer norm = new IpAddressNormalizer(); + String result = norm.normalize(ip); + assertEquals(expected, result); + log.debug("result: " + result); + } + + @Test + public void testIpNormalizer02() { + String ip = "1.2.3"; + IpAddressNormalizer norm = new IpAddressNormalizer(); + assertThrows(IllegalArgumentException.class, () -> norm.normalize(ip)); + } + + @Test + public void testIpNormalizer03() { + IpAddressNormalizer norm = new IpAddressNormalizer(); + if (log.isDebugEnabled()) { + log.debug("testIpNormalizer03"); + log.debug(norm.normalize("1.2.3.*")); + log.debug(norm.normalize("1.2.3..*")); + log.debug(norm.normalize("1.2.*")); + log.debug(norm.normalize("1.2..*")); + log.debug(norm.normalize("1.*")); + log.debug(norm.normalize("1..*")); + + } + assertEquals("001.002.003.*", norm.normalize("1.2.3.*")); + assertEquals("001.002.003.*", norm.normalize("1.2.3..*")); + assertEquals("001.002.*", norm.normalize("1.2.*")); + assertEquals("001.002.*", norm.normalize("1.2..*")); + assertEquals("001.*", norm.normalize("1.*")); + assertEquals("001.*", norm.normalize("1..*")); + } + + @Test + public void testIpNormalizer04() 
{ + log.debug("testIpNormalizer04"); + IpAddressNormalizer norm = new IpAddressNormalizer(); + log.debug(norm.normalize("*.2.13.4")); + log.debug(norm.normalize("*.13.4")); + assertEquals("*.002.013.004", norm.normalize("*.2.13.4")); + assertEquals("*.013.004", norm.normalize("*.13.4")); + } + + // TEST IS TURNED OFF + @Test + @Disabled + public void testIpNormalizer05() { + log.debug("testIpNormalizer05"); + IpV4Address ip = IpV4Address.parse("*.2.13.4"); + if (log.isDebugEnabled()) { + log.debug(ip.toString()); + log.debug(ip.toZeroPaddedString()); + log.debug(ip.toReverseString()); + log.debug(ip.toReverseZeroPaddedString()); + } + } + + /* + * NOTE: call toReverseString() on a wildcarded ip doesn't work right although this is not much of an issue. + */ + // TEST IS TURNED OFF + @Test + @Disabled + public void testIpNormalizer06() { + log.debug("testIpNormalizer06"); + IpV4Address ip = IpV4Address.parse("1.2.*"); + if (log.isDebugEnabled()) { + log.debug(ip.toString()); + log.debug(ip.toZeroPaddedString()); + log.debug(ip.toReverseString()); + log.debug(ip.toReverseZeroPaddedString()); + } + } + + @Test + public void testIpNormalizer07() { + log.debug("testIpNormalizer07"); + IpAddressNormalizer norm = new IpAddressNormalizer(); + log.debug(norm.normalize(" *.2. 13.4")); + log.debug(norm.normalize(" *.13.4 ")); + assertEquals("*.002.013.004", norm.normalize(" *.2. 
13.4")); + assertEquals("*.013.004", norm.normalize(" *.13.4 ")); + } + + @Test + public void testCidrTranslations() { + log.debug("testCidrTranslations"); + IpAddressNormalizer norm = new IpAddressNormalizer(); + assertArrayEquals(norm.normalizeCidrToRange("1.2.3.4/32"), new String[] {"001.002.003.004", "001.002.003.004"}); + assertArrayEquals(norm.normalizeCidrToRange("1.2.3.0/24"), new String[] {"001.002.003.000", "001.002.003.255"}); + assertArrayEquals(norm.normalizeCidrToRange("1.2.0.0/16"), new String[] {"001.002.000.000", "001.002.255.255"}); + assertArrayEquals(norm.normalizeCidrToRange("1.0.0.0/8"), new String[] {"001.000.000.000", "001.255.255.255"}); + assertArrayEquals(norm.normalizeCidrToRange("1.2.3.4/30"), new String[] {"001.002.003.004", "001.002.003.007"}); + + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/LcNoDiacriticsNormalizerTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/LcNoDiacriticsNormalizerTest.java new file mode 100644 index 00000000000..c06e636ecd0 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/LcNoDiacriticsNormalizerTest.java @@ -0,0 +1,17 @@ +package datawave.data.normalizer; + +import static org.junit.jupiter.api.Assertions.assertNull; + +import org.junit.jupiter.api.Test; + +public class LcNoDiacriticsNormalizerTest { + @Test + public void test1() { + LcNoDiacriticsNormalizer norm = new LcNoDiacriticsNormalizer(); + String b = null; + String n1 = norm.normalize(b); + + assertNull(n1); + + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/NormalizationExceptionTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/NormalizationExceptionTest.java new file mode 100644 index 00000000000..0f5b7454faf --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/NormalizationExceptionTest.java @@ -0,0 +1,60 @@ +package datawave.data.normalizer; + +import static 
org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import datawave.data.type.NumberType; + +public class NormalizationExceptionTest { + + private NormalizationException ne; + private Throwable throwable; + private String message; + + @BeforeEach + public void beforeTests() { + message = "NormalizationException (hint: it's your fault)"; + throwable = new Throwable(message); + } + + @Test + public void testPreEncodedValue() { + NumberType type = new NumberType(); + assertEquals("+cE1.23", type.normalize("+cE1.23")); + } + + @Test + public void testEmptyConstructor() { + ne = new NormalizationException(); + + assertNull(ne.getMessage()); + assertNull(ne.getLocalizedMessage()); + } + + @Test + public void testMessageThrowableConstructor() { + ne = new NormalizationException(message, throwable); + + assertEquals(message, ne.getMessage()); + assertEquals(message, ne.getLocalizedMessage()); + } + + @Test + public void testMessageConstructor() { + ne = new NormalizationException(message); + + assertEquals(message, ne.getMessage()); + assertEquals(message, ne.getLocalizedMessage()); + } + + @Test + public void testThrowableConstructor() { + ne = new NormalizationException(throwable); + + assertEquals(throwable.toString(), ne.getMessage()); + assertEquals(throwable.toString(), ne.getLocalizedMessage()); + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/NumberNormalizerTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/NumberNormalizerTest.java new file mode 100644 index 00000000000..3738b9ce61f --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/NumberNormalizerTest.java @@ -0,0 +1,264 @@ +package datawave.data.normalizer; + +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; +import static org.junit.jupiter.api.Assertions.assertEquals; + 
+import java.math.BigDecimal; +import java.math.RoundingMode; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.ThreadLocalRandom; +import java.util.regex.Pattern; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +public class NumberNormalizerTest { + + private final NumberNormalizer normalizer = new NumberNormalizer(); + private final ThreadLocalRandom random = ThreadLocalRandom.current(); + + /** + * Verify that the equivalent numbers 1 and 1.00000000 are normalized to the same encoding. + */ + @Test + public void testNormalizingEquivalentWholeAndDecimalNumber() { + String expected = "+aE1"; + assertNormalizeResult("1", expected); + assertNormalizeResult("1.00000000", expected); + } + + /** + * Verify that a negative number is normalized to the correct encoding. + */ + @Test + public void testNormalizingNegativeDecimal() { + String expected = "!ZE9"; + assertNormalizeResult("-1.0", expected); + + assertComparativelyConsecutive(expected, normalizer.normalize("1.0")); + } + + /** + * Verify that three different numbers are normalized to their correct respective encodings, and also verify that their encodings evaluate to the same + * consecutive order as the original numbers. + */ + @Test + public void testNormalizingNegativeToPositiveRangeInThousandths() { + String expected1 = "!dE9"; + assertNormalizeResult("-0.0001", expected1); + + String expected2 = "+AE0"; + assertNormalizeResult("0", expected2); + + String expected3 = "+VE1"; + assertNormalizeResult("0.00001", expected3); + + assertComparativelyConsecutive(expected1, expected2, expected3); + } + + /** + * Verify that large numbers are correctly normalized, and that their encodings evaluate to the same consecutive order as the original numbers. 
+ */ + @Test + public void testNormalizingMaxIntegerValue() { + String expected1 = "+jE2.147483647"; + assertNormalizeResult(Integer.toString(Integer.MAX_VALUE), "+jE2.147483647"); + + String expected2 = "+jE2.147483646"; + assertNormalizeResult(Integer.toString(Integer.MAX_VALUE - 1), "+jE2.147483646"); + + assertComparativelyConsecutive(expected2, expected1); + } + + /** + * Verify that two numbers that are equal, but one with extra zeroes, are normalized to the same encoding. + */ + @Test + public void testNormalizingEqualNumbersWithExtraZeroes() { + String expected = "!dE1"; + assertNormalizeResult("-0.0009", expected); + assertNormalizeResult("-0.00090", expected); + } + + /** + * Verify that different forms of zero will normalize to the same encoding. + */ + @Test + public void testNormalizingEquivalentZeroes() { + assertNormalizeResult("-0.0", "+AE0"); + assertNormalizeResult("0", "+AE0"); + assertNormalizeResult("0.0", "+AE0"); + } + + private void assertNormalizeResult(String input, String expected) { + assertEquals(normalizer.normalize(input), expected); + } + + private void assertComparativelyConsecutive(String... values) { + for (int i = 0; i < values.length - 1; i++) { + int compare = values[i].compareTo(values[i + 1]); + if (compare > 0) { + Assertions.fail("Expected values to be consecutive, but encountered " + values[i] + " which is greater than " + values[i + 1]); + } + } + } + + /** + * Generate random numbers and corresponding regex patterns, and verify that the patterns match against the numbers, and that the corresponding normalized + * regex patterns match against the corresponding normalized numbers. + */ + @Test + void testRandomRegexPatterns() { + for (int i = 0; i < 1000; i++) { + // Get a random number. Call getFastRandomNumber() for a quick test that takes less than a minute to complete. 
Call getRandomNumber() to get numbers + // that are random across a much larger scale, but expect the test to take possibly more than 20 minutes to complete. + String num = getFastRandomNumber(); + String normalizedNum = normalizer.normalize(num); + + // Generate 100 random patterns that should match against the number. + for (int j = 0; j < 100; j++) { + StringBuilder pattern = new StringBuilder(); + + int startPos = 0; + // Randomly start the regex with .* + if (random.nextBoolean()) { + pattern.append(".*"); + // If the number originally started with a '-', skip over it when appending to the pattern. + if (num.charAt(0) == '-') { + startPos = 1; + } + } + + boolean seenDecimal = false; + for (int pos = startPos; pos < num.length(); pos++) { + char character = num.charAt(pos); + if (Character.isDigit(character)) { + if (random.nextBoolean()) { + Set candidates = new HashSet<>(); + for (int count = 0; count < 10; count++) { + if (random.nextBoolean()) { + candidates.add(random.nextInt(10)); + } + } + candidates.add(Integer.valueOf(String.valueOf(character))); + pattern.append('['); + candidates.forEach(pattern::append); + pattern.append(']'); + } else if (random.nextBoolean()) { + pattern.append('.'); + } else if (random.nextBoolean()) { + pattern.append("\\d"); + } else { + pattern.append(character); + } + if (random.nextBoolean()) { + pattern.append("*"); + } else if (random.nextBoolean()) { + pattern.append("+"); + } else if (random.nextBoolean()) { + pattern.append("{1,3}"); + } + } else if (character == '.') { + seenDecimal = true; + pattern.append("\\."); + } else { + pattern.append(character); + } + } + + // If we've seen a decimal point, randomly append a trailing .* + if (seenDecimal && random.nextBoolean()) { + pattern.append(".*"); + } + + // Verify the pattern matches the original number. + assertThat(Pattern.compile(pattern.toString()).matcher(num).matches()).as("matching \n\"" + pattern + "\"\n to " + num).isTrue(); + + // Normalize the pattern. 
+ String normalizedPattern = normalizer.normalizeRegex(pattern.toString()); + + // check the normalized match + assertThat(Pattern.compile(normalizedPattern).matcher(normalizedNum).matches()) + .as("matching \n\"" + pattern + "\" -> \n\"" + normalizedPattern + "\"\n to " + num + " -> " + normalizedNum).isTrue(); + + // renormalize the pattern. + String renormalizedPattern = normalizer.normalizeRegex(normalizedPattern); + assertEquals(renormalizedPattern, normalizedPattern); + } + } + } + + /** + * Return a random number that when used in {@link #testRandomRegexPatterns()}, will not make the test take more than a minute to complete. + * + * @return a random number + */ + private String getFastRandomNumber() { + String num = Double.toString(random.nextDouble()); + if (num.contains("E")) { + num = Double.toString(random.nextDouble()); + } + return num; + } + + /** + * Return a random number. Note: when used in {@link #testRandomRegexPatterns()}, the test can take more than 20 minutes to complete. + * + * @return a random number + */ + private String getRandomNumber() { + return random.nextBoolean() ? getRandomNumberLessThanZero() : getRandomNumberGreaterThanZero(); + } + + /** + * Return a random number whose decimal point has been shifted right by a random number of places (so its magnitude may be larger than one), randomly negative, and randomly whole. + * + * @return a random number + */ + private String getRandomNumberGreaterThanZero() { + BigDecimal decimal = getRandomBigDecimal(); + + // Move the decimal point to the right randomly. + int leadingZeros = random.nextInt(0, 26); + decimal = decimal.movePointRight(leadingZeros); + + // Randomly trim the mantissa to make the number whole. + if (random.nextBoolean()) { + decimal = decimal.setScale(0, RoundingMode.FLOOR); + } + + return decimal.toPlainString(); + } + + /** + * Return a random number whose magnitude is less than one, and randomly negative.
+ * + * @return a random number + */ + private String getRandomNumberLessThanZero() { + BigDecimal decimal = getRandomBigDecimal(); + + // Move the decimal point to the left randomly. + int leadingZeros = random.nextInt(0, 26); + decimal = decimal.movePointLeft(leadingZeros); + + // Limit the mantissa length. + decimal = decimal.setScale(26, RoundingMode.FLOOR); + + return decimal.toPlainString(); + } + + /** + * Return a random big decimal that is randomly negative. + * + * @return a new big decimal + */ + private BigDecimal getRandomBigDecimal() { + BigDecimal decimal = BigDecimal.valueOf(random.nextDouble()); + if (random.nextBoolean()) { + decimal = decimal.negate(); + } + return decimal; + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/PointNormalizerTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/PointNormalizerTest.java new file mode 100644 index 00000000000..6ce1e62639c --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/PointNormalizerTest.java @@ -0,0 +1,159 @@ +package datawave.data.normalizer; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.codec.binary.Hex; +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.locationtech.geowave.core.geotime.util.GeometryUtils; +import org.locationtech.geowave.core.index.ByteArrayRange; +import org.locationtech.geowave.core.index.sfc.data.MultiDimensionalNumericData; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.geom.GeometryFactory; +import org.locationtech.jts.io.WKTWriter; + +import com.google.common.collect.Lists; + +public class PointNormalizerTest { + + private PointNormalizer pointNormalizer = null; + private 
GeometryNormalizer geometryNormalizer = null; + + @BeforeEach + public void setup() { + pointNormalizer = new PointNormalizer(); + geometryNormalizer = new GeometryNormalizer(); + } + + @Test + public void testPoint() { + Geometry point = new GeometryFactory().createPoint(new Coordinate(10, 10)); + List insertionIds = new ArrayList<>(pointNormalizer.expand(new WKTWriter().write(point))); + assertEquals(1, insertionIds.size()); + assertEquals("1f200a80a80a80a80a", insertionIds.get(0)); + + // make sure the insertion id matches the geo normalizer + List geoInsertionIds = new ArrayList<>(geometryNormalizer.expand(new WKTWriter().write(point))); + assertEquals(1, geoInsertionIds.size()); + assertEquals(insertionIds.get(0), geoInsertionIds.get(0)); + } + + @Test + public void testLine() { + Geometry line = new GeometryFactory().createLineString(new Coordinate[] {new Coordinate(-10, -10), new Coordinate(0, 0), new Coordinate(10, 20)}); + assertThrows(ClassCastException.class, () -> pointNormalizer.expand(new WKTWriter().write(line))); + } + + @Test + public void testPolygon() { + Geometry polygon = new GeometryFactory().createPolygon(new Coordinate[] {new Coordinate(-10, -10), new Coordinate(10, -10), new Coordinate(10, 10), + new Coordinate(-10, 10), new Coordinate(-10, -10)}); + assertThrows(ClassCastException.class, () -> pointNormalizer.expand(new WKTWriter().write(polygon))); + } + + @Test + public void testWKTPoint() { + Geometry geom = AbstractGeometryNormalizer.parseGeometry("POINT(10 20)"); + assertEquals(10.0, geom.getGeometryN(0).getCoordinate().x, 0.0); + assertEquals(20.0, geom.getGeometryN(0).getCoordinate().y, 0.0); + + List insertionIds = new ArrayList<>(pointNormalizer.expand(new WKTWriter().write(geom))); + assertEquals(1, insertionIds.size()); + assertEquals("1f20306ba4306ba430", insertionIds.get(0)); + + // make sure the insertion id matches the geo normalizer + List geoInsertionIds = new ArrayList<>(geometryNormalizer.expand(new 
WKTWriter().write(geom.getCentroid()))); + assertEquals(1, geoInsertionIds.size()); + assertEquals(insertionIds.get(0), geoInsertionIds.get(0)); + } + + @Test + public void testWKTPointz() { + Geometry geom = AbstractGeometryNormalizer.parseGeometry("POINT Z(10 20 30)"); + assertEquals(10.0, geom.getGeometryN(0).getCoordinate().x, 0.0); + assertEquals(20.0, geom.getGeometryN(0).getCoordinate().y, 0.0); + assertEquals(30.0, geom.getGeometryN(0).getCoordinate().z, 0.0); + + List insertionIds = new ArrayList<>(pointNormalizer.expand(new WKTWriter().write(geom))); + assertEquals(1, insertionIds.size()); + assertEquals("1f20306ba4306ba430", insertionIds.get(0)); + + // make sure the insertion id matches the geo normalizer + List geoInsertionIds = new ArrayList<>(geometryNormalizer.expand(new WKTWriter().write(geom.getCentroid()))); + assertEquals(1, geoInsertionIds.size()); + assertEquals(insertionIds.get(0), geoInsertionIds.get(0)); + } + + @Test + public void testQueryRanges() throws Exception { + Geometry polygon = new GeometryFactory().createPolygon(new Coordinate[] {new Coordinate(-10, -10), new Coordinate(10, -10), new Coordinate(10, 10), + new Coordinate(-10, 10), new Coordinate(-10, -10)}); + + List allRanges = new ArrayList<>(); + for (MultiDimensionalNumericData range : GeometryUtils.basicConstraintsFromEnvelope(polygon.getEnvelopeInternal()) + .getIndexConstraints(PointNormalizer.getPointIndex())) { + allRanges.addAll(Lists.reverse(PointNormalizer.getPointIndexStrategy().getQueryRanges(range).getCompositeQueryRanges())); + } + + assertEquals(171, allRanges.size()); + + StringBuilder result = new StringBuilder(); + for (ByteArrayRange range : allRanges) { + result.append(Hex.encodeHexString(range.getStart())); + result.append(Hex.encodeHexString(range.getEnd())); + } + + String expected = IOUtils.toString(this.getClass().getClassLoader().getResourceAsStream("datawave/data/normalizer/pointRanges.txt"), "UTF8"); + + assertEquals(expected, result.toString()); + } 
+ + @Test + public void testPointQueryRangesMatchGeoQueryRanges() { + Geometry polygon = new GeometryFactory().createPolygon(new Coordinate[] {new Coordinate(-10, -10), new Coordinate(10, -10), new Coordinate(10, 10), + new Coordinate(-10, 10), new Coordinate(-10, -10)}); + + List allPointRanges = new ArrayList<>(); + for (MultiDimensionalNumericData range : GeometryUtils.basicConstraintsFromEnvelope(polygon.getEnvelopeInternal()) + .getIndexConstraints(PointNormalizer.getPointIndex())) { + allPointRanges.addAll(Lists.reverse(PointNormalizer.getPointIndexStrategy().getQueryRanges(range).getCompositeQueryRanges())); + } + + assertEquals(171, allPointRanges.size()); + + StringBuilder pointResult = new StringBuilder(); + for (ByteArrayRange range : allPointRanges) { + pointResult.append(Hex.encodeHexString(range.getStart())); + pointResult.append(Hex.encodeHexString(range.getEnd())); + } + + List allGeoRanges = new ArrayList<>(); + for (MultiDimensionalNumericData range : GeometryUtils.basicConstraintsFromEnvelope(polygon.getEnvelopeInternal()) + .getIndexConstraints(GeometryNormalizer.getGeometryIndex())) { + allGeoRanges.addAll(Lists.reverse(GeometryNormalizer.getGeometryIndexStrategy().getQueryRanges(range).getCompositeQueryRanges())); + } + + assertEquals(3746, allGeoRanges.size()); + + int numPointRanges = 0; + StringBuilder geoResult = new StringBuilder(); + for (ByteArrayRange range : allGeoRanges) { + String start = Hex.encodeHexString(range.getStart()); + String end = Hex.encodeHexString(range.getEnd()); + if (start.startsWith("1f") && end.startsWith("1f")) { + geoResult.append(start); + geoResult.append(end); + numPointRanges++; + } + } + + assertEquals(171, numPointRanges); + assertEquals(geoResult.toString(), pointResult.toString()); + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/NodeAssert.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/NodeAssert.java new file mode 100644 index 
00000000000..fa513af5a3b --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/NodeAssert.java @@ -0,0 +1,337 @@ +package datawave.data.normalizer.regex; + +import java.util.Objects; + +import org.assertj.core.api.AbstractAssert; +import org.assertj.core.api.AbstractStringAssert; +import org.assertj.core.api.Assertions; + +import datawave.data.normalizer.regex.visitor.EqualityVisitor; +import datawave.data.normalizer.regex.visitor.PrintVisitor; +import datawave.data.normalizer.regex.visitor.StringVisitor; + +public class NodeAssert,ACTUAL extends Node> extends AbstractAssert { + + public static NodeAssert assertThat(Node node) { + return new NodeAssert<>(node); + } + + protected NodeAssert(ACTUAL actual) { + super(actual, NodeAssert.class); + } + + protected NodeAssert(ACTUAL actual, Class selfType) { + super(actual, selfType); + } + + public NodeAssert hasNullParent() { + isNotNull(); + Node parent = actual.getParent(); + if (parent != null) { + failWithMessage("Expected parent to be null, but was %s", parent); + } + return this; + } + + public NodeAssert hasNonNullParent() { + isNotNull(); + if (actual.getParent() == null) { + failWithMessage("Expected parent to be non-null"); + } + return this; + } + + public NodeAssert hasChildCount(int count) { + isNotNull(); + int actualCount = actual.getChildCount(); + if (count != actualCount) { + failWithMessage("Expected child count to be %d, but was %d", count, actualCount); + } + return this; + } + + public NodeAssert hasNoChildren() { + return hasChildCount(0); + } + + public NodeAssert assertChild(int childIndex) { + isNotNull(); + int childCount = actual.getChildCount(); + if (childIndex >= childCount) { + failWithMessage("Expected to find child at index %d but there are %d children", childIndex, childCount); + } + Node child = actual.getChildAt(childIndex); + return assertThat(child); + } + + public NodeAssert assertParent() { + isNotNull(); + return assertThat(actual.getParent()); + 
} + + public NodeAssert assertGrandparent() { + isNotNull(); + return assertParent().assertParent(); + } + + public NodeAssert isEqualTreeTo(Node expected) { + if (!EqualityVisitor.isEqual(actual, expected)) { + String actualString = StringVisitor.toString(actual); + String actualPrint = PrintVisitor.printToString(actual); + String expectedString = StringVisitor.toString(expected); + String expectedPrint = PrintVisitor.printToString(expected); + failWithMessage("Expected actual '%s'\n%s\n\nto be equal to expected '%s'\n%s\n\nbut were different trees", actualString, actualPrint, + expectedString, expectedPrint); + } + return this; + } + + public NodeAssert isNotEqualTreeTo(Node node) { + if (EqualityVisitor.isEqual(actual, node)) { + failWithMessage("Expected %s to not be equal to %s, but were identical trees"); + } + return this; + } + + public NodeAssert isExpressionNode() { + isInstanceOf(ExpressionNode.class); + return this; + } + + public NodeAssert isAlternationNode() { + isInstanceOf(AlternationNode.class); + return this; + } + + public NodeAssert isGroupNode() { + isInstanceOf(GroupNode.class); + return this; + } + + public CharClassNodeAssert isCharClassNode() { + isInstanceOf(CharClassNode.class); + return new CharClassNodeAssert((CharClassNode) actual); + } + + public NodeAssert isDigitCharClassNode() { + isInstanceOf(DigitCharClassNode.class); + return this; + } + + public NodeAssert isRepetitionNode() { + isInstanceOf(RepetitionNode.class); + return this; + } + + public NodeAssert isAnyCharNode() { + isInstanceOf(AnyCharNode.class); + return this; + } + + public NodeAssert isZeroToManyNode() { + isInstanceOf(ZeroOrMoreNode.class); + return this; + } + + public NodeAssert isOneToManyNode() { + isInstanceOf(OneOrMoreNode.class); + return this; + } + + public NodeAssert isOptionalNode() { + isInstanceOf(QuestionMarkNode.class); + return this; + } + + public NodeAssert isEmptyNode() { + isInstanceOf(EmptyNode.class); + return this; + } + + public 
SingleCharNodeAssert isSingleCharNode() { + isInstanceOf(SingleCharNode.class); + return new SingleCharNodeAssert((SingleCharNode) actual); + } + + public CharRangeNodeAssert isCharRangeNode() { + isInstanceOf(CharRangeNode.class); + return new CharRangeNodeAssert((CharRangeNode) actual); + } + + public IntegerNodeAssert isIntegerNode() { + isInstanceOf(IntegerNode.class); + return new IntegerNodeAssert((IntegerNode) actual); + } + + public IntegerRangeNodeAssert isIntegerRangeNode() { + isInstanceOf(IntegerRangeNode.class); + return new IntegerRangeNodeAssert((IntegerRangeNode) actual); + } + + public NodeAssert isStartAnchorNode() { + isInstanceOf(StartAnchorNode.class); + return this; + } + + public NodeAssert isEndAnchorNode() { + isInstanceOf(EndAnchorNode.class); + return this; + } + + public EscapedSingleCharNodeAssert isEscapedSingleCharNode() { + isInstanceOf(EscapedSingleCharNode.class); + return new EscapedSingleCharNodeAssert((EscapedSingleCharNode) actual); + } + + public NodeAssert isEncodedNumberNode() { + isInstanceOf(EncodedNumberNode.class); + return this; + } + + public NodeAssert isEncodedPatternNode() { + isInstanceOf(EncodedPatternNode.class); + return this; + } + + public AbstractStringAssert asTreeString() { + isNotNull(); + return Assertions.assertThat(StringVisitor.toString(actual)); + } + + public static class SingleCharNodeAssert extends NodeAssert { + + protected SingleCharNodeAssert(SingleCharNode node) { + super(node, SingleCharNodeAssert.class); + } + + public SingleCharNodeAssert hasCharacter(char expected) { + isNotNull(); + char actualChar = actual.getCharacter(); + if (!Objects.equals(actualChar, expected)) { + failWithMessage("Expected character to be %s but was %s", expected, actualChar); + } + return this; + } + } + + public static class EscapedSingleCharNodeAssert extends NodeAssert { + + protected EscapedSingleCharNodeAssert(EscapedSingleCharNode node) { + super(node, EscapedSingleCharNodeAssert.class); + } + + public 
EscapedSingleCharNodeAssert hasCharacter(char expected) { + isNotNull(); + char actualChar = actual.getCharacter(); + if (!Objects.equals(actualChar, expected)) { + failWithMessage("Expected character to be %s but was %s", expected, actualChar); + } + return this; + } + } + + public static class CharRangeNodeAssert extends NodeAssert { + + protected CharRangeNodeAssert(CharRangeNode node) { + super(node, CharRangeNodeAssert.class); + } + + public CharRangeNodeAssert hasStart(char expected) { + isNotNull(); + char actualStart = actual.getStart(); + if (!Objects.equals(expected, actualStart)) { + failWithMessage("Expected start to be %s but was %s", expected, actualStart); + } + return this; + } + + public CharRangeNodeAssert hasEnd(char expected) { + isNotNull(); + char actualEnd = actual.getEnd(); + if (!Objects.equals(expected, actualEnd)) { + failWithMessage("Expected end to be %s but was %s", expected, actualEnd); + } + return this; + } + } + + public static class CharClassNodeAssert extends NodeAssert { + + protected CharClassNodeAssert(CharClassNode node) { + super(node, CharClassNodeAssert.class); + } + + public CharClassNodeAssert isNegated() { + isNotNull(); + if (!actual.isNegated()) { + failWithMessage("Expected character class to be negated, but was not"); + } + return this; + } + + public CharClassNodeAssert isNotNegated() { + isNotNull(); + if (actual.isNegated()) { + failWithMessage("Expected character class to not be negated, but was"); + } + return this; + } + } + + public static class IntegerNodeAssert extends NodeAssert { + + protected IntegerNodeAssert(IntegerNode node) { + super(node, IntegerNodeAssert.class); + } + + public IntegerNodeAssert hasValue(int expected) { + isNotNull(); + int actualValue = actual.getValue(); + if (actualValue != expected) { + failWithMessage("Expected value to be %d but was %d", expected, actualValue); + } + return this; + } + } + + public static class IntegerRangeNodeAssert extends NodeAssert { + + protected 
IntegerRangeNodeAssert(IntegerRangeNode node) { + super(node, IntegerRangeNodeAssert.class); + } + + public IntegerRangeNodeAssert hasStart(Integer expected) { + isNotNull(); + Integer actualStart = actual.getStart(); + if (!Objects.equals(expected, actualStart)) { + failWithMessage("Expected start to be %d but was %d", expected, actualStart); + } + return this; + } + + public IntegerRangeNodeAssert hasEnd(Integer expected) { + isNotNull(); + Integer actualEnd = actual.getEnd(); + if (!Objects.equals(expected, actualEnd)) { + failWithMessage("Expected end to be %d but was %d", expected, actualEnd); + } + return this; + } + + public IntegerRangeNodeAssert hasBoundedEnd() { + isNotNull(); + if (!actual.isEndBounded()) { + failWithMessage("Expected end to be bounded"); + } + return this; + } + + public IntegerRangeNodeAssert hasUnboundedEnd() { + isNotNull(); + if (actual.isEndBounded()) { + failWithMessage("Expected end to be unbounded but was %d", actual.getEnd()); + } + return this; + } + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/NumericRegexEncoderTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/NumericRegexEncoderTest.java new file mode 100644 index 00000000000..052ae2c3865 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/NumericRegexEncoderTest.java @@ -0,0 +1,607 @@ +package datawave.data.normalizer.regex; + +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import datawave.data.type.util.NumericalEncoder; + +class NumericRegexEncoderTest { + + private static final List letters = new ArrayList<>(); + + @BeforeAll + static void 
beforeAll() { + letters.addAll(generateLetters('a', 'z')); + letters.addAll(generateLetters('A', 'Z')); + } + + /** + * Return an unmodifiable list of letters in order from the given starting letter to the given ending letter. + * + * @param start + * the starting letter + * @param end + * the ending letter + * @return a list of letters + */ + private static List generateLetters(char start, char end) { + // @formatter:off + return IntStream.rangeClosed(start, end) + .mapToObj(c -> "" + (char) c).collect(Collectors.toUnmodifiableList()); + // @formatter:on + } + + /** + * Verify that an exception is thrown for a blank regex pattern. + */ + @Test + void testEmptyRegex() { + assertExceptionThrown("", "Regex pattern may not be blank."); + } + + /** + * Verify that an exception is thrown for any regex with whitespace. + */ + @Test + void testRegexWithWhitespace() { + assertExceptionThrown(" 123 ", "Regex pattern may not contain any whitespace."); + assertExceptionThrown("123| 234", "Regex pattern may not contain any whitespace."); + } + + /** + * Verify that an exception is thrown for any regex that cannot be compiled. + */ + @Test + void testNonCompilablePatterns() { + // Empty character class. + assertExceptionThrown("123[]", "Regex pattern will not compile."); + // Empty negated character class. + assertExceptionThrown("123[^]", "Regex pattern will not compile."); + // Trailing backslash. + assertExceptionThrown("123\\", "Regex pattern will not compile."); + // Leading optional. + assertExceptionThrown("?234", "Regex pattern will not compile."); + // Repetition with undefined start. + assertExceptionThrown("3{,3}", "Regex pattern will not compile."); + } + + /** + * Verify that exceptions are thrown for nonsensical patterns that, while compilable, are certainly not valid numeric regexes. 
+ */ + @Test + void testNonsensePatterns() { + assertExceptionThrown("\\.", "A nonsense pattern has been given that cannot be normalized."); + assertExceptionThrown("\\-", "A nonsense pattern has been given that cannot be normalized."); + assertExceptionThrown("-?", "A nonsense pattern has been given that cannot be normalized."); + assertExceptionThrown("^$", "A nonsense pattern has been given that cannot be normalized."); + assertExceptionThrown("^", "A nonsense pattern has been given that cannot be normalized."); + assertExceptionThrown("$", "A nonsense pattern has been given that cannot be normalized."); + assertExceptionThrown("(\\.)+", "A nonsense pattern has been given that cannot be normalized."); + assertExceptionThrown("\\.*", "A nonsense pattern has been given that cannot be normalized."); + assertExceptionThrown("-{3}", "A nonsense pattern has been given that cannot be normalized."); + assertExceptionThrown("()|()", "A nonsense pattern has been given that cannot be normalized."); + assertExceptionThrown("\\.|-", "A nonsense pattern has been given that cannot be normalized."); + } + + /** + * Verify that an exception is thrown for any regex that contains a letter other than the special case of \d. + */ + @Test + void testRegexWithRestrictedLetters() { + // Verify an exception is thrown for any non-escaped letter. + for (String letter : letters) { + assertExceptionThrown(letter, "Regex pattern may not contain any letters other than \\d to indicate a member of the digit character class 0-9."); + } + + // Verify an exception is thrown for '\D'. + assertExceptionThrown("\\D", "Regex pattern may not contain any letters other than \\d to indicate a member of the digit character class 0-9."); + } + + /** + * Verify that an exception is thrown for any escaped character that is not \- \. or \d. + */ + @Test + void testRegexWithRestrictedEscapedCharacters() { + // Verify no exception thrown for \d, \-, \., or an escaped number. 
+ NumericRegexEncoder.encode("\\d"); + NumericRegexEncoder.encode("\\.3"); + NumericRegexEncoder.encode("\\-4"); + + // Verify exceptions are thrown for other characters. + assertExceptionThrown("\\\\", "Regex pattern may not contain any escaped characters other than \\. \\- or \\d."); + assertExceptionThrown("\\(", "Regex pattern may not contain any escaped characters other than \\. \\- or \\d."); + assertExceptionThrown("\\)", "Regex pattern may not contain any escaped characters other than \\. \\- or \\d."); + assertExceptionThrown("1\\?", "Regex pattern may not contain any escaped characters other than \\. \\- or \\d."); + assertExceptionThrown("\\[", "Regex pattern may not contain any escaped characters other than \\. \\- or \\d."); + assertExceptionThrown("\\]", "Regex pattern may not contain any escaped characters other than \\. \\- or \\d."); + assertExceptionThrown("1\\|", "Regex pattern may not contain any escaped characters other than \\. \\- or \\d."); + assertExceptionThrown("1\\+", "Regex pattern may not contain any escaped characters other than \\. \\- or \\d."); + assertExceptionThrown("1\\*", "Regex pattern may not contain any escaped characters other than \\. \\- or \\d."); + assertExceptionThrown("1\\^", "Regex pattern may not contain any escaped characters other than \\. \\- or \\d."); + assertExceptionThrown("1\\$", "Regex pattern may not contain any escaped characters other than \\. \\- or \\d."); + } + + /** + * Verify that an exception is thrown for regexes with groups. + */ + @Test + void testRegexWithGroups() { + assertExceptionThrown("(234)*", "Regex pattern may not contain any groups."); + } + + /** + * Verify that an exception is thrown for regexes with invalid character classes. + */ + @Test + void testRegexWithInvalidCharacterClasses() { + assertExceptionThrown("[+!]", "Character classes may only contain numeric characters and numeric ranges."); + } + + /** + * Verify that invalid decimal points are not allowed. 
+ */ + @Test + void testInvalidDecimalPoints() { + // Verify quantifiers and optionals result in exceptions. + assertExceptionThrown("234\\.?34", "Regex pattern may not contain any decimal points that are directly followed by * ? or {}."); + assertExceptionThrown("234\\.*34", "Regex pattern may not contain any decimal points that are directly followed by * ? or {}."); + assertExceptionThrown("234\\.+34", "Regex pattern may not contain any decimal points that are directly followed by * ? or {}."); + assertExceptionThrown("234\\.{3}34", "Regex pattern may not contain any decimal points that are directly followed by * ? or {}."); + + // Verify multiple required decimal points result in exceptions. + assertExceptionThrown("3\\.34\\.3", "Regex may not contain expressions with than one decimal point."); + assertExceptionThrown("543.*|3\\.34\\.3", "Regex may not contain expressions with than one decimal point."); + } + + /** + * Verify that patterns that are ultimately empty after trimming all zero-length repetitions are not allowed. + */ + @Test + void testRegexConsistingOfZeroLengthRepetition() { + assertExceptionThrown("3{0}?", "Regex pattern is empty after trimming all characters followed by {0} or {0,0}."); + assertExceptionThrown("3{0,0}", "Regex pattern is empty after trimming all characters followed by {0} or {0,0}."); + assertExceptionThrown("3{0,0}?|[4-6]{0}", "Regex pattern is empty after trimming all characters followed by {0} or {0,0}."); + } + + /** + * Test that regexes not requiring encoding are not modified. 
+ */ + @Test + void testRegexesThatDoNotRequireEncoding() { + assertRegex(".*").normalizesTo(".*"); + assertRegex("^.*$").normalizesTo("^.*$"); + assertRegex(".*?").normalizesTo(".*?"); + assertRegex("^.*?$").normalizesTo("^.*?$"); + assertRegex(".+").normalizesTo(".+"); + assertRegex("^.+$").normalizesTo("^.+$"); + assertRegex(".+?").normalizesTo(".+?"); + assertRegex("^.+?$").normalizesTo("^.+?$"); + assertRegex(".*.+.*?.+?").normalizesTo(".*.+.*?.+?"); + assertRegex("^.*.+.*?.+?$").normalizesTo("^.*.+.*?.+?$"); + } + + /** + * Test parsing regexes that consists of simple numbers. + */ + @Test + void testSimpleNumbers() { + // @formatter:off + // Single simple numbers. + assertRegex("123").normalizesTo("\\+cE1\\.23") + .matches("123"); + assertRegex("-32").normalizesTo("!YE6\\.8") + .matches("-32"); + assertRegex("983749587983487998734\\.34534").normalizesTo("\\+uE9\\.8374958798348799873434534") + .matches("983749587983487998734.34534"); + assertRegex("9983495030984594\\.54332").normalizesTo("\\+pE9\\.98349503098459454332") + .matches("9983495030984594.54332"); + assertRegex("-8889793487598488893485793").normalizesTo("!BE1\\.110206512401511106514207") + .matches("-8889793487598488893485793"); + + // Verify escaped hyphens are supported. + assertRegex("\\-32").normalizesTo("!YE6\\.8") + .matches("-32"); + + // Verify anchors are trimmed. + assertRegex("^123").normalizesTo("\\+cE1\\.23") + .matches("123"); + assertRegex("123$").normalizesTo("\\+cE1\\.23") + .matches("123"); + assertRegex("^123$").normalizesTo("\\+cE1\\.23") + .matches("123"); + + // Verify no issues with combining anchors and escaped hyphens. + assertRegex("^\\-123\\.234$").normalizesTo("!XE8\\.76766") + .matches("-123.234"); + + // Verify no issues with alternated simple numbers. 
+ assertRegex("12|-45|23\\.45").normalizesTo("\\+bE1\\.2|!YE5\\.5|\\+bE2\\.345") + .matchesAllOf("12", "-45", "23.45"); + assertRegex("^12|-45|23\\.45$").normalizesTo("\\+bE1\\.2|!YE5\\.5|\\+bE2\\.345") + .matchesAllOf("12", "-45", "23.45"); + // @formatter:on + } + + @Test + void testDigitCharacterClass() { + // @formatter:off + assertRegex("\\d").normalizesTo("\\+aE\\d|\\+AE0") + .matchesAllOf("0", "1", "2", "3", "4", "5", "6", "7", "8", "9") + .matchesNoneOf("11", "34", "454"); + // @formatter:on + } + + @Test + void testCharacterClasses() { + // @formatter:off + // Test regexes made only of character classes. + assertRegex("[12][45][78]").normalizesTo("\\+cE[12]\\.[45][78]") + .matchesAllOf("147", "148", "157", "158", "247", "248", "257", "258") + .matchesNoneOf("14.7", "14.8", "1.57", "1.58", "27.7", "25.7", "258.1"); + + // Test character classes with a defined decimal point. + assertRegex("[12][45]\\.[78]").normalizesTo("\\+bE[12]\\.[45][78]") + .matchesAllOf("14.7", "14.8", "15.7", "15.8", "24.7", "24.8", "25.7", "25.8") + .matchesNoneOf("147", "148", "157", "158", "247", "248", "257", "258"); + + // Test character classes combined with numbers. + assertRegex("12[6-8]").normalizesTo("\\+cE1\\.2[6-8]") + .matchesAllOf("126", "127", "128") + .matchesNoneOf("125", "129", "12.6", "12.7", "128.4"); + + assertRegex("1\\.2[6-8]").normalizesTo("\\+aE1\\.2[6-8]") + .matchesAllOf("1.26", "1.27", "1.28") + .matchesNoneOf("1.25", "1.29"); + + assertRegex("[6-8]12").normalizesTo("\\+cE[6-8]\\.12") + .matchesAllOf("612", "712", "812") + .matchesNoneOf("512", "912"); + + assertRegex("[6-8]1\\.2").normalizesTo("\\+bE[6-8]\\.12") + .matchesAllOf("61.2", "71.2", "81.2") + .matchesNoneOf("51.2", "91.2"); + // @formatter:on + } + + @Test + void testCharacterClassesContainingPossibleZeroes() { + // Special case where a regex contains character classes that can match numbers equal to or greater than one, or zero. 
In this case, the zero must be + // put into an alternation and removed from the character class to ensure correct matching. + // @formatter:off + assertRegex("[0-9]").normalizesTo("\\+aE[0-9]|\\+AE0").matchesAllOf("0", "1", "2", "3", "4", "5", "6", "7", "8", "9"); + + assertRegex("[01234566789]").normalizesTo("\\+aE[01234566789]|\\+AE0") + .matchesAllOf("0", "1", "2", "3", "4", "5", "6", "7", "8", "9"); + // @formatter:on + } + + @Test + void testTrailingWildcard() { + // @formatter:off + assertRegex("111.").normalizesTo("\\+[c-d]E1\\.11.?") + .matchesAllOf("1110", "1111", "1113", "1114", "1115", "1116", "1117", "1118", "1119") + .matchesNoneOf("11145", "111.45"); + // @formatter:on + } + + @Test + void testTrailingWildcardZeroOrMore() { + // Test .* and .*? at end of regex for positive number. + // @formatter:off + assertRegex("111.*").normalizesTo("\\+[c-z]E1\\.11.*") + .matchesAllOf("111", "111445", "111.4325", "11153453.234") + .matchesNoneOf("1.11", "11.1", "0.43111"); + + + assertRegex("111.*?").normalizesTo("\\+[c-z]E1\\.11.*?") + .matchesAllOf("111", "111445", "111.4325", "11153453.234") + .matchesNoneOf("1.11", "11.1", "0.43111"); + // @formatter:on + + // Test .* and .*? at end of regex for negative number. + // @formatter:off + assertRegex("-111.*").normalizesTo("![A-X]E8\\.8(9|8.+)") + .matchesAllOf("-111", "-111.0", "-111.1", "-111.2", "-111.3", "-111.4", "-111.5", "-111.6", "-111.7", "-111.8", "-111.9") + .matchesAllOf("-1110", "-1111", "-1112", "-1113", "-1114", "-1115", "-1116", "-1117", "-1118", "-1119") + .matchesAllOf("-1114353454", "-1110.09203498", "-111090820394802933.234") + .matchesNoneOf("-110", "-1.11", "-11.1", "-121"); + + assertRegex("-111.*?").normalizesTo("![A-X]E8\\.8(9|8.+?)") + .matchesAllOf("-1112", "-111.454", "-111111232", "-111") + .matchesNoneOf("-11", "-113544"); + // @formatter:on + } + + @Test + void testTrailingWildcardOneOrMore() { + // Test .+ and .+? at end of regex for positive number. 
+ // @formatter:off + assertRegex("111.+").normalizesTo("\\+[c-z]E1\\.11.*") + .matchesAllOf("1111", "111.0", "111445", "111.4325", "11153453.234") + .matchesNoneOf("1.11", "11.1", "0.43111"); + + assertRegex("111.+?").normalizesTo("\\+[c-z]E1\\.11.*?") + .matchesAllOf("111.0", "111445", "111.4325", "11153453.234") + .matchesNoneOf("1.11", "11.1", "0.43111"); + + // Test .+ and .+? at end of regex for negative number. + assertRegex("-111.+").normalizesTo("![A-X]E8\\.8(9|8.+)") + .matchesAllOf("-111.0", "-111.1", "-111.2", "-111.3", "-111.4", "-111.5", "-111.6", "-111.7", "-111.8", "-111.9") + .matchesAllOf("-1110", "-1111", "-1112", "-1113", "-1114", "-1115", "-1116", "-1117", "-1118", "-1119") + .matchesAllOf("-1114353454", "-1110.09203498", "-111090820394802933.234") + .matchesNoneOf("-110", "-1.11", "-11.1", "-121"); + + assertRegex("-111.+?").normalizesTo("![A-X]E8\\.8(9|8.+?)") + .matchesAllOf("-111.0", "-111.1", "-111.2", "-111.3", "-111.4", "-111.5", "-111.6", "-111.7", "-111.8", "-111.9") + .matchesAllOf("-1110", "-1111", "-1112", "-1113", "-1114", "-1115", "-1116", "-1117", "-1118", "-1119") + .matchesAllOf("-1114353454", "-1110.09203498", "-111090820394802933.234") + .matchesNoneOf("-110", "-1.11", "-11.1", "-121"); + // @formatter:on + } + + @Test + void testLeadingZeroOrMoreQuantifier() { + // @formatter:off + // Test .* at start of regex. The .* can remain a .* after the decimal point since it is a zero or more match. + assertRegex(".*54").normalizesTo("\\+[b-zA-Z]E.*5\\.?4|![A-Ya-z]E.*4\\.?6") + .matchesAllOf("154", "6644444444444444.54", "54", "-154", "-54", "-3566666666654", "0.00054", "-0.42222254") + .matchesNoneOf("111143"); + + // Test .*?. + assertRegex(".*?54").normalizesTo("\\+[b-zA-Z]E.*?5\\.?4|![A-Ya-z]E.*?4\\.?6") + .matchesAllOf("154", "6644444444444444.54", "54", "-154", "-54", "-3566666666654", "0.00054", "-0.42222254") + .matchesNoneOf("111143"); + + // Test .* at start of regex. 
The .* can remain a .* after the decimal point since it is a zero or more match. + assertRegex(".*\\.54").normalizesTo("\\+[a-zZ]E.*5\\.?4|![A-Za]E.*4\\.?6") + .matchesAllOf("0.54", "6644444444444444.54", "-1.54", "-.54", "-35666666666.54") + .matchesNoneOf("111143", "0.00054"); + + // Test .*?. + assertRegex(".*?\\.54").normalizesTo("\\+[a-zZ]E.*?5\\.?4|![A-Za]E.*?4\\.?6") + .matchesAllOf("1.54", "6644444444444444.54", "-.54", "-1.54") + .matchesNoneOf("111143"); + + assertRegex("\\..*54").normalizesTo("\\+[A-Z]E.*5\\.?4") + .matchesAllOf(".154", ".054", ".54", ".3566666666654", ".00054", ".42222254") + .matchesNoneOf("111143", "6644444444444444.54", "1.54", "154", "-.154"); + + // Test .*?. + assertRegex("\\..*?54").normalizesTo("\\+[A-Z]E.*?5\\.?4") + .matchesAllOf(".154", ".054", ".54", ".3566666666654", ".00054", ".42222254") + .matchesNoneOf("111143", "6644444444444444.54", "1.54", "154", "-.154"); + // @formatter:on + } + + @Test + void testLeadingOneOrMoreQuantifier() { + // @formatter:off + // Test .+ at start of regex. The .+ should become a .* after the decimal point since we have one wildcard guaranteed before the decimal point. + assertRegex(".+54").normalizesTo("\\+[b-zA-Z]E.*5\\.?4|![A-Ya-z]E.*4\\.?6") + .matchesAllOf("154", "6644444444444444.54", "-154", "-444444444444454", "0.54", "054") + .matchesNoneOf("5.4", "542343"); + + // Test .+?. + assertRegex(".+?54").normalizesTo("\\+[b-zA-Z]E.*?5\\.?4|![A-Ya-z]E.*?4\\.?6") + .matchesAllOf("154", "6644444444444444.54", "-154", "-222222222222254", "-054") + .matchesNoneOf("5.4", "542343"); + + assertRegex(".+\\.54").normalizesTo("\\+[a-zZ]E.*5\\.?4|![A-Za]E.*4\\.?6") + .matchesAllOf("1.54", "6644444444444444.54", "-1.54", "-4444444444444.54", "0.54") + .matchesNoneOf("542343", ".544453"); + + // Test .+?. 
+ assertRegex(".+?\\.54").normalizesTo("\\+[a-zZ]E.*?5\\.?4|![A-Za]E.*?4\\.?6") + .matchesAllOf("1.54", "6644444444444444.54", "-1.54", "-2222222222222.54") + .matchesNoneOf("542343", ".0000054"); + + assertRegex("\\..+54").normalizesTo("\\+[A-Z]E.*5\\.?4") + .matchesAllOf(".154", ".664444444444444454", ".0000000054", ".054") + .matchesNoneOf("54", "5.4", "542343", "-.154", "154", "-.000054"); + + // Test .+?. + assertRegex("\\..+?54").normalizesTo("\\+[A-Z]E.*?5\\.?4") + .matchesAllOf(".154", ".664444444444444454", ".0000000054", ".054") + .matchesNoneOf("54", "5.4", "542343", "-.154", "154", "-.000054"); + // @formatter:on + } + + @Test + void testLeadingOneOrMoreQuantifierForWildcard() { + // @formatter:off + // Test .{3} at start of regex. Per the expected pattern, the .{3} is encoded as an optional wildcard, an optional decimal point, and up to two further wildcards. + assertRegex(".{3}54").normalizesTo("\\+[b-eW-Z]E.?\\.?.{0,2}5\\.?4|![V-Ya-d]E.?\\.?.{0,2}4\\.?6") + .matchesAllOf("11154", "43.54", "-1154", "-4454", "00054", "-0054") + .matchesNoneOf("5.4", "542343"); + + // Test .{3}?. + assertRegex(".{3}?54").normalizesTo("\\+[b-eW-Z]E.?\\.?.{0,2}?5\\.?4|![V-Ya-d]E.?\\.?.{0,2}?4\\.?6") + .matchesAllOf("00154", "-1054", "00054", "99954", ".0054") + .matchesNoneOf("5.4", "542343", "6644444444444444.54", "-222222222222254"); + + assertRegex(".{3}\\.54").normalizesTo("\\+[a-cZ]E.?\\.?.{0,2}5\\.?4|![X-Za]E.?\\.?.{0,2}4\\.?6") + .matchesAllOf("111.54", "444.54", "000.54", "-00.54", "-46.54") + .matchesNoneOf("542343", "343.4554", "0.000054"); + + // Test .{3}?. + assertRegex(".{3}?\\.54").normalizesTo("\\+[a-cZ]E.?\\.?.{0,2}?5\\.?4|![X-Za]E.?\\.?.{0,2}?4\\.?6") + .matchesAllOf("111.54", "444.54", "000.54", "-00.54", "-46.54") + .matchesNoneOf("542343", "343.4554", "0.000054"); + + assertRegex("\\..{3}54").normalizesTo("\\+[W-Z]E.?\\.?.{0,2}5\\.?4") + .matchesAllOf(".00154", ".34354", ".99954") + .matchesNoneOf("54", "5.4", "542343", "-.00154"); + + // Test .{3}?.
+ assertRegex("\\..{3}?54").normalizesTo("\\+[W-Z]E.?\\.?.{0,2}?5\\.?4") + .matchesAllOf(".00154", ".34354", ".99954") + .matchesNoneOf("54", "5.4", "542343", "-.00154"); + // @formatter:on + } + + @Test + void testLeadingRepetitionQuantifier() { + // @formatter:off + // Test {3} at start of regex. The {3} should become a {2} after the decimal point since we will have an occurrence '1' exactly specified in the regex + // before the decimal point. + assertRegex("1{3}4").normalizesTo("\\+dE1\\.1{2}4") + .matches("1114") + .matchesNoneOf("1.4", "114", "11114", "111.4"); + + // Test {2} at start of regex. The repetition quantifier should be removed entirely after the decimal point since we have the two occurrences of '1' + // exactly specified in the regex. + assertRegex("1{2}4").normalizesTo("\\+cE1\\.14") + .matches("114") + .matchesNoneOf("1.4", "14", "11.4", "1114"); + + // Test {3,} at start of regex. The {3,} should become a {2,} after the decimal point since we will have an occurrence of '1' exactly specified in the + // regex before the decimal point. + assertRegex("1{3,}4").normalizesTo("\\+[d-z]E1\\.1{2,}4") + .matchesAllOf("1114", "1111111114", "11111111111111111114") + .matchesNoneOf("1.4", "14", "114", "124", "111.4"); + + // Test {2,} at start of regex. The {2,} should become a + since we only need to require an occurrence of '1' one or more times after the decimal point. + assertRegex("1{2,}4").normalizesTo("\\+[c-z]E1\\.1+4") + .matchesAllOf("114", "11111114", "1111111111111111114") + .matchesNoneOf("1.4", "4", "14", "24", "11.4"); + + // Test {1} at start of regex. The {1} can be removed entirely. + assertRegex("1{1}4").normalizesTo("\\+bE1\\.4") + .matches("14") + .matchesNoneOf("1", "1.4", "4", "114", "1111111111111114", "124"); + + // Test {1,} at start of regex. The {1,} should become a * since an occurrence of '1' can happen zero or more times after the decimal point. 
+ assertRegex("1{1,}4").normalizesTo("\\+[b-z]E1\\.?1*4") + .matchesAllOf("14", "1111111114", "11111111111111111114") + .matchesNoneOf("4", "1.4", "104"); + + // Test {1,2} at start of regex. The {1,2} should become a {0,1} after the decimal point. + assertRegex("1{1,2}4").normalizesTo("\\+[b-c]E1\\.?1{0,1}4") + .matchesAllOf("14", "114") + .matchesNoneOf("1", "4", "1.4", "104", "1114"); + + // Test {2,3} at start of regex. The {2,3} should become a {1,2} after the decimal point. + assertRegex("1{2,3}4").normalizesTo("\\+[c-d]E1\\.1{1,2}4") + .matchesAllOf("114", "1114") + .matchesNoneOf("1.4", "14", "11114", "104", "11.4"); + + // Test {5,14} at start of regex. The {5,14} should become a {4,13} after the decimal point. + assertRegex("1{5,14}4").normalizesTo("\\+[f-o]E1\\.1{4,13}4") + .matchesAllOf("111114", "11111111114", "111111111111114") + .matchesNoneOf("11111.4", "11114", "1111111111111114"); + + // Test {0,} at the start of the regex. The {0,} is equivalent to * and can remain the same after the decimal point. + assertRegex("1{0,}4").normalizesTo("\\+[a-z]E1?\\.?1*4") + .matchesAllOf("4", "14", "11111111111114", "111111111111111114") + .matchesNoneOf("1", "104"); + + // Test {0,5} at the start of the regex. The {0,5} should become {0,4} after the decimal point.
+ assertRegex("1{0,5}4").normalizesTo("\\+[a-f]E1?\\.?1{0,4}4") + .matchesAllOf("4", "14", "114", "1114", "11114", "111114") + .matchesNoneOf("1", "1111114"); + // @formatter:on + } + + @Test + void testSubOneRegexesWithDecimalPoints() { + // @formatter:off + assertRegex("0\\.5.*").normalizesTo("\\+ZE5\\.?.*") + .matchesAllOf("0.5", "0.545984") + .matchesNoneOf("0.6", "0.05"); + + assertRegex("0\\.005.*").normalizesTo("\\+XE5\\.?.*") + .matchesAllOf("0.005", "0.0059834795") + .matchesNoneOf("0.05", "0.006"); + + assertRegex("0\\.0000000000000[3-7]45\\d").normalizesTo("\\+ME[3-7]\\.45\\d?") + .matchesAllOf("0.00000000000003450", "0.00000000000005453", "0.00000000000007459") + .matchesNoneOf("0.000345", "0.00000000000001450"); + // @formatter:on + } + + @Test + void testSingleTrailingZeroOrMoreAfterDecimalPoint() { + // @formatter:off + assertRegex("0\\.5.*").normalizesTo("\\+ZE5\\.?.*") + .matchesAllOf("0.5", "0.545984") + .matchesNoneOf("0.6", "0.05"); + // @formatter:on + } + + @Test + void testComplexNegativePattern() { + // @formatter:off + assertRegex("-34\\d.{0,4}4*").normalizesTo("![A-X]E6\\.(6|5\\d|5\\d.{0,4}|5\\d.{0,4}5*6)") + .matchesAllOf("-340", "-341", "-342", "-343", "-344", "-345", "-346", "-347", "-348", "-349") // Test matching -34\d portion + .matchesNoneOf("340", "341", "342", "343", "344", "345", "346", "347", "348", "349") // Ensure does not match positive variant + .matchesAllOf("-3411", "-34112", "-341123", "-3411234", "-3419876") // Test matching -34\d.{0,4} portion + .matchesAllOf("-3419586444444", "-34195864444444444444") // Test matching -34\d.{0,4}4* portion + .matchesNoneOf("-3414321999999"); // Ensure does not match numbers not ending with 4* + // @formatter:on + } + + private void assertExceptionThrown(String pattern, String message) { + assertThatThrownBy(() -> NumericRegexEncoder.encode(pattern)).hasMessage(message); + } + + private RegexAssert assertRegex(String regex) { + return new RegexAssert(regex); + } + + private static class 
RegexAssert { + private final Pattern original; + private Pattern normalized; + + private RegexAssert(String pattern) { + this.original = Pattern.compile(pattern); + } + + public RegexAssert normalizesTo(String expected) { + String actual = NumericRegexEncoder.encode(original.toString()); + assertThat(actual).as("Check normalizing %s", original).isEqualTo(expected); + normalized = Pattern.compile(actual); + return this; + } + + public RegexAssert matches(String number) { + assertMatchStatus(number, true); + return this; + } + + public RegexAssert matchesAllOf(String... numbers) { + for (String number : numbers) { + matches(number); + } + return this; + } + + public RegexAssert doesNotMatch(String number) { + assertMatchStatus(number, false); + return this; + } + + public RegexAssert matchesNoneOf(String... numbers) { + for (String number : numbers) { + doesNotMatch(number); + } + return this; + } + + private void assertMatchStatus(String number, boolean match) { + String matchStatus = match ? 
" matches " : " does not match "; + assertThat(original.matcher(number).matches()).as("Assert " + original + matchStatus + number).isEqualTo(match); + String encodedNumber = NumericalEncoder.encode(number); + assertThat(normalized.matcher(encodedNumber).matches()).as("Assert " + normalized + matchStatus + encodedNumber + " (" + number + ")") + .isEqualTo(match); + } + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/RegexParserTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/RegexParserTest.java new file mode 100644 index 00000000000..71d674575c3 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/RegexParserTest.java @@ -0,0 +1,306 @@ +package datawave.data.normalizer.regex; + +import static datawave.data.normalizer.regex.NodeAssert.assertThat; +import static datawave.data.normalizer.regex.RegexParser.parse; + +import org.junit.jupiter.api.Test; + +class RegexParserTest { + + @Test + void testParsingEmptyString() { + // @formatter:off + assertThat(parse("")).isExpressionNode().hasChildCount(1) + .assertChild(0).isEmptyNode().hasNoChildren(); + // @formatter:on + } + + @Test + void testParsingNumbers() { + // Test parsing a whole number. + // @formatter:off + assertThat(parse("345")).isExpressionNode().hasChildCount(3) + .assertChild(0).isSingleCharNode().hasCharacter('3').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('4').hasNoChildren().assertParent() + .assertChild(2).isSingleCharNode().hasCharacter('5').hasNoChildren(); + // @formatter:on + + // Test parsing a floating point number. 
+ // @formatter:off + assertThat(parse("23\\.5")).isExpressionNode().hasChildCount(4) + .assertChild(0).isSingleCharNode().hasCharacter('2').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('3').hasNoChildren().assertParent() + .assertChild(2).isEscapedSingleCharNode().hasCharacter('.').hasNoChildren().assertParent() + .assertChild(3).isSingleCharNode().hasCharacter('5').hasNoChildren(); + // @formatter:on + + // Test parsing a negative floating point number. + // @formatter:off + assertThat(parse("-12\\.5")).isExpressionNode().hasChildCount(5) + .assertChild(0).isSingleCharNode().hasCharacter('-').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('1').hasNoChildren().assertParent() + .assertChild(2).isSingleCharNode().hasCharacter('2').hasNoChildren().assertParent() + .assertChild(3).isEscapedSingleCharNode().hasCharacter('.').hasNoChildren().assertParent() + .assertChild(4).isSingleCharNode().hasCharacter('5').hasNoChildren(); + // @formatter:on + } + + @Test + void testParsingAlternations() { + // Test parsing simple top-level alternations. + // @formatter:off + assertThat(parse("24|4|5")).isExpressionNode().hasChildCount(1) + .assertChild(0).isAlternationNode().hasChildCount(3) + .assertChild(0).isExpressionNode().hasChildCount(2) + .assertChild(0).isSingleCharNode().hasCharacter('2').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('4').hasNoChildren().assertGrandparent() + .assertChild(1).isExpressionNode().hasChildCount(1) + .assertChild(0).isSingleCharNode().hasCharacter('4').hasNoChildren().assertGrandparent() + .assertChild(2).isExpressionNode().hasChildCount(1) + .assertChild(0).isSingleCharNode().hasCharacter('5').hasNoChildren(); + // @formatter:on + + // Test parsing only empty alternations. 
+ // @formatter:off + assertThat(parse("||")).isExpressionNode().hasChildCount(1) + .assertChild(0).isAlternationNode().hasChildCount(3) + .assertChild(0).isEmptyNode().hasNoChildren().assertParent() + .assertChild(1).isEmptyNode().hasNoChildren().assertParent() + .assertChild(2).isEmptyNode().hasNoChildren(); + // @formatter:on + + // Test parsing trailing empty alternation. + // @formatter:off + assertThat(parse("3|")).isExpressionNode().hasChildCount(1) + .assertChild(0).isAlternationNode().hasChildCount(2) + .assertChild(0).isExpressionNode().hasChildCount(1) + .assertChild(0).isSingleCharNode().hasCharacter('3').assertGrandparent() + .assertChild(1).isEmptyNode().hasNoChildren(); + // @formatter:on + } + + @Test + void testParsingCharacterClasses() { + // Test parsing a digit character class. + // @formatter:off + assertThat(parse("\\d")).isExpressionNode().hasChildCount(1) + .assertChild(0).isDigitCharClassNode().hasNoChildren(); + // @formatter:on + + // Test parsing a character class with digits. + // @formatter:off + assertThat(parse("[458]")).isExpressionNode().hasChildCount(1) + .assertChild(0).isCharClassNode().isNotNegated().hasChildCount(3) + .assertChild(0).isSingleCharNode().hasCharacter('4').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('5').hasNoChildren().assertParent() + .assertChild(2).isSingleCharNode().hasCharacter('8').hasNoChildren(); + // @formatter:on + + // Test parsing a negated character class. + // @formatter:off + assertThat(parse("[^458]")).isExpressionNode().hasChildCount(1) + .assertChild(0).isCharClassNode().isNegated().hasChildCount(3) + .assertChild(0).isSingleCharNode().hasCharacter('4').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('5').hasNoChildren().assertParent() + .assertChild(2).isSingleCharNode().hasCharacter('8').hasNoChildren(); + // @formatter:on + + // Test parsing a character class with a negative sign at the beginning. 
+ // @formatter:off + assertThat(parse("[\\-58]")).isExpressionNode().hasChildCount(1) + .assertChild(0).isCharClassNode().isNotNegated().hasChildCount(3) + .assertChild(0).isEscapedSingleCharNode().hasCharacter('-').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('5').hasNoChildren().assertParent() + .assertChild(2).isSingleCharNode().hasCharacter('8').hasNoChildren(); + // @formatter:on + + // Test parsing a character class with a negative sign at the end. + // @formatter:off + assertThat(parse("[58\\-]")).isExpressionNode().hasChildCount(1) + .assertChild(0).isCharClassNode().isNotNegated().hasChildCount(3) + .assertChild(0).isSingleCharNode().hasCharacter('5').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('8').hasNoChildren().assertParent() + .assertChild(2).isEscapedSingleCharNode().hasCharacter('-').hasNoChildren(); + // @formatter:on + + // Test parsing a character class with a range. + // @formatter:off + assertThat(parse("[5-8]")).isExpressionNode().hasChildCount(1) + .assertChild(0).isCharClassNode().isNotNegated().hasChildCount(1) + .assertChild(0).isCharRangeNode().hasStart('5').hasEnd('8').hasNoChildren(); + // @formatter:on + + // Test parsing a character class with a range and a subsequent digit. + // @formatter:off + assertThat(parse("[2-46]")).isExpressionNode().hasChildCount(1) + .assertChild(0).isCharClassNode().isNotNegated().hasChildCount(2) + .assertChild(0).isCharRangeNode().hasStart('2').hasEnd('4').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('6').hasNoChildren(); + // @formatter:on + + // Test parsing a character class with multiple ranges. 
+ // @formatter:off + assertThat(parse("[2-46-8]")).isExpressionNode().hasChildCount(1) + .assertChild(0).isCharClassNode().isNotNegated().hasChildCount(2) + .assertChild(0).isCharRangeNode().hasStart('2').hasEnd('4').hasNoChildren().assertParent() + .assertChild(1).isCharRangeNode().hasStart('6').hasEnd('8').hasNoChildren(); + // @formatter:on + + // Test parsing a character class with a decimal point. + // @formatter:off + assertThat(parse("[.58]")).isExpressionNode().hasChildCount(1) + .assertChild(0).isCharClassNode().isNotNegated().hasChildCount(3) + .assertChild(0).isSingleCharNode().hasCharacter('.').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('5').hasNoChildren().assertParent() + .assertChild(2).isSingleCharNode().hasCharacter('8').hasNoChildren(); + // @formatter:on + + // Test parsing character classes with escaped characters. + assertThat(parse("[\\^\\\\]")).isExpressionNode().hasChildCount(1).assertChild(0).isCharClassNode().isNotNegated().hasChildCount(2).assertChild(0) + .isEscapedSingleCharNode().hasCharacter('^').hasNoChildren().assertParent().assertChild(1).isEscapedSingleCharNode().hasCharacter('\\') + .hasNoChildren(); + + // Test parsing character classes with letters. + assertThat(parse("[a-zA-Z]")).isExpressionNode().hasChildCount(1).assertChild(0).isCharClassNode().isNotNegated().hasChildCount(2).assertChild(0) + .isCharRangeNode().hasStart('a').hasEnd('z').hasNoChildren().assertParent().assertChild(1).isCharRangeNode().hasStart('A').hasEnd('Z') + .hasNoChildren(); + + // Test parsing character classes with non-alphanumeric characters. + assertThat(parse("[!+]")).isExpressionNode().hasChildCount(1).assertChild(0).isCharClassNode().isNotNegated().hasChildCount(2).assertChild(0) + .isSingleCharNode().hasCharacter('!').hasNoChildren().assertParent().assertChild(1).isSingleCharNode().hasCharacter('+') + .hasNoChildren(); + } + + @Test + void testParsingRepetition() { + // Test parsing a non-ranged repetition. 
+ // @formatter:off + assertThat(parse("{3}")).isExpressionNode().hasChildCount(1) + .assertChild(0).isRepetitionNode().hasChildCount(1) + .assertChild(0).isIntegerNode().hasValue(3).hasNoChildren(); + // @formatter:on + + // Test parsing a ranged repetition with a bounded start and end. + // @formatter:off + assertThat(parse("{3,6}")).isExpressionNode().hasChildCount(1) + .assertChild(0).isRepetitionNode().hasChildCount(1) + .assertChild(0).isIntegerRangeNode().hasStart(3).hasEnd(6).hasNoChildren(); + // @formatter:on + + // Test parsing a ranged repetition with an unbounded end. + // @formatter:off + assertThat(parse("{3,}")).isExpressionNode().hasChildCount(1) + .assertChild(0).isRepetitionNode().hasChildCount(1) + .assertChild(0).isIntegerRangeNode().hasStart(3).hasUnboundedEnd().hasNoChildren(); + // @formatter:on + + // Test parsing multi-digit repetitions. + // @formatter:off + assertThat(parse("{344}")).isExpressionNode().hasChildCount(1) + .assertChild(0).isRepetitionNode().hasChildCount(1) + .assertChild(0).isIntegerNode().hasValue(344); + + assertThat(parse("{344,665}")).isExpressionNode().hasChildCount(1) + .assertChild(0).isRepetitionNode().hasChildCount(1) + .assertChild(0).isIntegerRangeNode().hasStart(344).hasEnd(665); + // @formatter:on + } + + @Test + void testParsingGroups() { + // Test parsing a simple group. + // @formatter:off + assertThat(parse("(123)")).isExpressionNode().hasChildCount(1) + .assertChild(0).isGroupNode().hasChildCount(1) + .assertChild(0).isExpressionNode().hasChildCount(3) + .assertChild(0).isSingleCharNode().hasCharacter('1').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('2').hasNoChildren().assertParent() + .assertChild(2).isSingleCharNode().hasCharacter('3').hasNoChildren(); + // @formatter:on + + // Test parsing a group with alternations. 
+ // @formatter:off + assertThat(parse("(12|3)")).isExpressionNode().hasChildCount(1) + .assertChild(0).isGroupNode().hasChildCount(1) + .assertChild(0).isAlternationNode().hasChildCount(2) + .assertChild(0).isExpressionNode().hasChildCount(2) + .assertChild(0).isSingleCharNode().hasCharacter('1').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('2').hasNoChildren().assertGrandparent() + .assertChild(1).isExpressionNode().hasChildCount(1) + .assertChild(0).isSingleCharNode().hasCharacter('3').hasNoChildren(); + // @formatter:on + + // Test parsing empty group. + // @formatter:off + assertThat(parse("()")).isExpressionNode().hasChildCount(1) + .assertChild(0).isGroupNode().hasChildCount(1) + .assertChild(0).isEmptyNode(); + // @formatter:on + + // Test nested groups. + // @formatter:off + assertThat(parse("(1|(5|4))")).isExpressionNode().hasChildCount(1) + .assertChild(0).isGroupNode().hasChildCount(1) + .assertChild(0).isAlternationNode().hasChildCount(2) + .assertChild(0).isExpressionNode().hasChildCount(1) + .assertChild(0).isSingleCharNode().hasCharacter('1').hasNoChildren().assertGrandparent() + .assertChild(1).isGroupNode().hasChildCount(1) + .assertChild(0).isAlternationNode().hasChildCount(2) + .assertChild(0).isExpressionNode().hasChildCount(1) + .assertChild(0).isSingleCharNode().hasCharacter('5').hasNoChildren().assertGrandparent() + .assertChild(1).isExpressionNode().hasChildCount(1) + .assertChild(0).isSingleCharNode().hasCharacter('4').hasNoChildren(); + // @formatter:on + } + + @Test + void testParsingDot() { + // @formatter:off + assertThat(parse("23.")).isExpressionNode().hasChildCount(3) + .assertChild(0).isSingleCharNode().hasCharacter('2').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('3').hasNoChildren().assertParent() + .assertChild(2).isAnyCharNode().hasNoChildren(); + // @formatter:on + } + + @Test + void testParsingStar() { + // @formatter:off + 
assertThat(parse("23*")).isExpressionNode().hasChildCount(3) + .assertChild(0).isSingleCharNode().hasCharacter('2').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('3').hasNoChildren().assertParent() + .assertChild(2).isZeroToManyNode().hasNoChildren(); + // @formatter:on + } + + @Test + void testParsingPlus() { + // @formatter:off + assertThat(parse("23+")).isExpressionNode().hasChildCount(3) + .assertChild(0).isSingleCharNode().hasCharacter('2').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('3').hasNoChildren().assertParent() + .assertChild(2).isOneToManyNode().hasNoChildren(); + // @formatter:on + } + + @Test + void testParsingQuestionMark() { + // @formatter:off + assertThat(parse("23?")).isExpressionNode().hasChildCount(3) + .assertChild(0).isSingleCharNode().hasCharacter('2').hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('3').hasNoChildren().assertParent() + .assertChild(2).isOptionalNode().hasNoChildren(); + // @formatter:on + } + + @Test + void testParsingAnchors() { + // @formatter:off + assertThat(parse("^12$")).isExpressionNode().hasChildCount(4) + .assertChild(0).isStartAnchorNode().hasNoChildren().assertParent() + .assertChild(1).isSingleCharNode().hasCharacter('1').hasNoChildren().assertParent() + .assertChild(2).isSingleCharNode().hasCharacter('2').hasNoChildren().assertParent() + .assertChild(3).isEndAnchorNode().hasNoChildren(); + // @formatter:on + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/RegexUtilsTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/RegexUtilsTest.java new file mode 100644 index 00000000000..947079b8155 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/RegexUtilsTest.java @@ -0,0 +1,191 @@ +package datawave.data.normalizer.regex; + +import static datawave.data.normalizer.regex.RegexParser.parse; +import static 
org.assertj.core.api.Assertions.assertThat; + +import org.apache.commons.lang3.tuple.Pair; +import org.junit.jupiter.api.Test; + +class RegexUtilsTest { + + @Test + void testSplitOnAlternations() { + // Test empty string. + assertThat(RegexUtils.splitOnAlternations("")).containsExactly(""); + + // Test no alternations. + assertThat(RegexUtils.splitOnAlternations("123")).containsExactly("123"); + + // Test top-level alternations. + assertThat(RegexUtils.splitOnAlternations("12|(34[45])|56.*")).containsExactly("12", "(34[45])", "56.*"); + + // Test pipes within groups. + assertThat(RegexUtils.splitOnAlternations("(234|345)")).containsExactly("(234|345)"); + + // Test leading empty alternation. + assertThat(RegexUtils.splitOnAlternations("|34")).containsExactly("", "34"); + + // Test trailing empty alternation. + assertThat(RegexUtils.splitOnAlternations("34|")).containsExactly("34", ""); + + // Test inner alternations with no content. + assertThat(RegexUtils.splitOnAlternations("|34||35|")).containsExactly("", "34", "", "35", ""); + + // Test only empty alternations. 
+ assertThat(RegexUtils.splitOnAlternations("||")).containsExactly("", "", ""); + + } + + @Test + void testIsNumber() { + assertThat(RegexUtils.isNumber("123")).isTrue(); + assertThat(RegexUtils.isNumber("-123")).isTrue(); + assertThat(RegexUtils.isNumber("12\\.3")).isTrue(); + assertThat(RegexUtils.isNumber("-12\\.3")).isTrue(); + + assertThat(RegexUtils.isNumber("-12.3")).isFalse(); + assertThat(RegexUtils.isNumber("345|54")).isFalse(); + assertThat(RegexUtils.isNumber("(123)")).isFalse(); + assertThat(RegexUtils.isNumber("[34]")).isFalse(); + assertThat(RegexUtils.isNumber("34*")).isFalse(); + assertThat(RegexUtils.isNumber("34+")).isFalse(); + assertThat(RegexUtils.isNumber("34?")).isFalse(); + } + + @Test + void testEncodeNumber() { + assertThat(RegexUtils.encodeNumber("123")).isEqualTo("\\+cE1\\.23"); + assertThat(RegexUtils.encodeNumber("123\\.4")).isEqualTo("\\+cE1\\.234"); + assertThat(RegexUtils.encodeNumber("-14")).isEqualTo("!YE8\\.6"); + assertThat(RegexUtils.encodeNumber("-1\\.4")).isEqualTo("!ZE8\\.6"); + assertThat(RegexUtils.encodeNumber("-1111111\\.3454")).isEqualTo("!TE8\\.8888886546"); + } + + @Test + void testIsChar() { + assertThat(RegexUtils.isChar(parse("0").getFirstChild(), '0')).isTrue(); + assertThat(RegexUtils.isChar(parse("0").getFirstChild(), '1')).isFalse(); + assertThat(RegexUtils.isChar(parse("\\.").getFirstChild(), '.')).isTrue(); + } + + @Test + void testContainsChar() { + assertThat(RegexUtils.charClassMatches(parse("[3-7]").getFirstChild(), '0')).isFalse(); + assertThat(RegexUtils.charClassMatches(parse("[3-7]").getFirstChild(), '1')).isFalse(); + assertThat(RegexUtils.charClassMatches(parse("[3-7]").getFirstChild(), '2')).isFalse(); + assertThat(RegexUtils.charClassMatches(parse("[3-7]").getFirstChild(), '8')).isFalse(); + assertThat(RegexUtils.charClassMatches(parse("[3-7]").getFirstChild(), '9')).isFalse(); + + assertThat(RegexUtils.charClassMatches(parse("[3-7]").getFirstChild(), '3')).isTrue(); + 
assertThat(RegexUtils.charClassMatches(parse("[3-7]").getFirstChild(), '4')).isTrue(); + assertThat(RegexUtils.charClassMatches(parse("[3-7]").getFirstChild(), '5')).isTrue(); + assertThat(RegexUtils.charClassMatches(parse("[3-7]").getFirstChild(), '6')).isTrue(); + assertThat(RegexUtils.charClassMatches(parse("[3-7]").getFirstChild(), '7')).isTrue(); + + assertThat(RegexUtils.charClassMatches(parse("[059]").getFirstChild(), '1')).isFalse(); + assertThat(RegexUtils.charClassMatches(parse("[059]").getFirstChild(), '2')).isFalse(); + assertThat(RegexUtils.charClassMatches(parse("[059]").getFirstChild(), '3')).isFalse(); + assertThat(RegexUtils.charClassMatches(parse("[059]").getFirstChild(), '4')).isFalse(); + assertThat(RegexUtils.charClassMatches(parse("[059]").getFirstChild(), '6')).isFalse(); + assertThat(RegexUtils.charClassMatches(parse("[059]").getFirstChild(), '7')).isFalse(); + assertThat(RegexUtils.charClassMatches(parse("[059]").getFirstChild(), '8')).isFalse(); + + assertThat(RegexUtils.charClassMatches(parse("[059]").getFirstChild(), '0')).isTrue(); + assertThat(RegexUtils.charClassMatches(parse("[059]").getFirstChild(), '5')).isTrue(); + assertThat(RegexUtils.charClassMatches(parse("[059]").getFirstChild(), '9')).isTrue(); + } + + @Test + void testContainsCharNegated() { + assertThat(RegexUtils.charClassMatches(parse("[^3-7]").getFirstChild(), '0')).isTrue(); + assertThat(RegexUtils.charClassMatches(parse("[^3-7]").getFirstChild(), '1')).isTrue(); + assertThat(RegexUtils.charClassMatches(parse("[^3-7]").getFirstChild(), '2')).isTrue(); + assertThat(RegexUtils.charClassMatches(parse("[^3-7]").getFirstChild(), '8')).isTrue(); + assertThat(RegexUtils.charClassMatches(parse("[^3-7]").getFirstChild(), '9')).isTrue(); + + assertThat(RegexUtils.charClassMatches(parse("[^3-7]").getFirstChild(), '3')).isFalse(); + assertThat(RegexUtils.charClassMatches(parse("[^3-7]").getFirstChild(), '4')).isFalse(); + 
assertThat(RegexUtils.charClassMatches(parse("[^3-7]").getFirstChild(), '5')).isFalse(); + assertThat(RegexUtils.charClassMatches(parse("[^3-7]").getFirstChild(), '6')).isFalse(); + assertThat(RegexUtils.charClassMatches(parse("[^3-7]").getFirstChild(), '7')).isFalse(); + + assertThat(RegexUtils.charClassMatches(parse("[^059]").getFirstChild(), '1')).isTrue(); + assertThat(RegexUtils.charClassMatches(parse("[^059]").getFirstChild(), '2')).isTrue(); + assertThat(RegexUtils.charClassMatches(parse("[^059]").getFirstChild(), '3')).isTrue(); + assertThat(RegexUtils.charClassMatches(parse("[^059]").getFirstChild(), '4')).isTrue(); + assertThat(RegexUtils.charClassMatches(parse("[^059]").getFirstChild(), '6')).isTrue(); + assertThat(RegexUtils.charClassMatches(parse("[^059]").getFirstChild(), '7')).isTrue(); + assertThat(RegexUtils.charClassMatches(parse("[^059]").getFirstChild(), '8')).isTrue(); + + assertThat(RegexUtils.charClassMatches(parse("[^059]").getFirstChild(), '0')).isFalse(); + assertThat(RegexUtils.charClassMatches(parse("[^059]").getFirstChild(), '5')).isFalse(); + assertThat(RegexUtils.charClassMatches(parse("[^059]").getFirstChild(), '9')).isFalse(); + } + + @Test + void testMatchesZero() { + // Test single char nodes. + assertThat(RegexUtils.matchesZero(new SingleCharNode('0'))).isTrue(); + assertThat(RegexUtils.matchesZero(new SingleCharNode('1'))).isFalse(); + assertThat(RegexUtils.matchesZero(new SingleCharNode('2'))).isFalse(); + assertThat(RegexUtils.matchesZero(new SingleCharNode('3'))).isFalse(); + assertThat(RegexUtils.matchesZero(new SingleCharNode('4'))).isFalse(); + assertThat(RegexUtils.matchesZero(new SingleCharNode('5'))).isFalse(); + assertThat(RegexUtils.matchesZero(new SingleCharNode('6'))).isFalse(); + assertThat(RegexUtils.matchesZero(new SingleCharNode('7'))).isFalse(); + assertThat(RegexUtils.matchesZero(new SingleCharNode('8'))).isFalse(); + assertThat(RegexUtils.matchesZero(new SingleCharNode('9'))).isFalse(); + + // Test wildcard. 
+ assertThat(RegexUtils.matchesZero(new AnyCharNode())).isTrue(); + + // Test digit character class. + assertThat(RegexUtils.matchesZero(parse("\\d").getFirstChild())).isTrue(); + + // Test character classes. + assertThat(RegexUtils.matchesZero(parse("[0]").getFirstChild())).isTrue(); + assertThat(RegexUtils.matchesZero(parse("[0126-9]").getFirstChild())).isTrue(); + assertThat(RegexUtils.matchesZero(parse("[0-4]").getFirstChild())).isTrue(); + assertThat(RegexUtils.matchesZero(parse("[123456789]").getFirstChild())).isFalse(); + assertThat(RegexUtils.matchesZero(parse("[1-9]").getFirstChild())).isFalse(); + } + + @Test + void testMatchesZeroOnly() { + // Test single char nodes. + assertThat(RegexUtils.matchesZeroOnly(new SingleCharNode('0'))).isTrue(); + assertThat(RegexUtils.matchesZeroOnly(new SingleCharNode('1'))).isFalse(); + assertThat(RegexUtils.matchesZeroOnly(new SingleCharNode('2'))).isFalse(); + assertThat(RegexUtils.matchesZeroOnly(new SingleCharNode('3'))).isFalse(); + assertThat(RegexUtils.matchesZeroOnly(new SingleCharNode('4'))).isFalse(); + assertThat(RegexUtils.matchesZeroOnly(new SingleCharNode('5'))).isFalse(); + assertThat(RegexUtils.matchesZeroOnly(new SingleCharNode('6'))).isFalse(); + assertThat(RegexUtils.matchesZeroOnly(new SingleCharNode('7'))).isFalse(); + assertThat(RegexUtils.matchesZeroOnly(new SingleCharNode('8'))).isFalse(); + assertThat(RegexUtils.matchesZeroOnly(new SingleCharNode('9'))).isFalse(); + + // Test wildcard. + assertThat(RegexUtils.matchesZeroOnly(new AnyCharNode())).isFalse(); + + // Test digit character class. + assertThat(RegexUtils.matchesZeroOnly(parse("\\d").getFirstChild())).isFalse(); + + // Test character classes. 
+ assertThat(RegexUtils.matchesZeroOnly(parse("[0]").getFirstChild())).isTrue(); + assertThat(RegexUtils.matchesZeroOnly(parse("[0-0]").getFirstChild())).isTrue(); + assertThat(RegexUtils.matchesZeroOnly(parse("[0126-9]").getFirstChild())).isFalse(); + assertThat(RegexUtils.matchesZeroOnly(parse("[0-4]").getFirstChild())).isFalse(); + assertThat(RegexUtils.matchesZeroOnly(parse("[123456789]").getFirstChild())).isFalse(); + assertThat(RegexUtils.matchesZeroOnly(parse("[1-9]").getFirstChild())).isFalse(); + assertThat(RegexUtils.matchesZeroOnly(parse("[0-9]").getFirstChild())).isFalse(); + } + + @Test + void testGetQuantifierRange() { + assertThat(RegexUtils.getQuantifierRange(new ZeroOrMoreNode())).isEqualTo(Pair.of(0, null)); + assertThat(RegexUtils.getQuantifierRange(new OneOrMoreNode())).isEqualTo(Pair.of(1, null)); + assertThat(RegexUtils.getQuantifierRange(parse("{2}").getFirstChild())).isEqualTo(Pair.of(2, 2)); + assertThat(RegexUtils.getQuantifierRange(parse("{2,5}").getFirstChild())).isEqualTo(Pair.of(2, 5)); + assertThat(RegexUtils.getQuantifierRange(parse("{2,}").getFirstChild())).isEqualTo(Pair.of(2, null)); + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/AlternationDeduperTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/AlternationDeduperTest.java new file mode 100644 index 00000000000..c82f5a5e9bb --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/AlternationDeduperTest.java @@ -0,0 +1,42 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.NodeAssert.assertThat; +import static datawave.data.normalizer.regex.RegexParser.parse; + +import org.junit.jupiter.api.Test; + +import datawave.data.normalizer.regex.Node; + +class AlternationDeduperTest { + + @Test + void testPatternsWithoutAlternations() { + assertNotDeduped("0"); + assertNotDeduped("345.*"); + assertNotDeduped("(345.*)"); + 
assertNotDeduped("-653[3-5]"); + } + + @Test + void testAlternationsWithoutDuplications() { + assertNotDeduped("0|1|5|56"); + assertNotDeduped("45.*|76[3-6]|.*?343"); + } + + @Test + void testAlternationsWithDuplicates() { + assertDeduped("54|54", "54"); + assertDeduped("54|34.*|54", "54|34.*"); + assertDeduped("54|34.*|76.*34|34.*", "54|34.*|76.*34"); + } + + private void assertNotDeduped(String pattern) { + assertDeduped(pattern, pattern); + } + + private void assertDeduped(String pattern, String expectedPattern) { + Node actual = AlternationDeduper.dedupe(parse(pattern)); + Node expected = parse(expectedPattern); + assertThat(actual).isEqualTreeTo(expected); + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/AnchorTrimmerTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/AnchorTrimmerTest.java new file mode 100644 index 00000000000..deb1ec5fa26 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/AnchorTrimmerTest.java @@ -0,0 +1,52 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.NodeAssert.assertThat; +import static datawave.data.normalizer.regex.RegexParser.parse; + +import org.junit.jupiter.api.Test; + +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.visitor.AnchorTrimmer; + +class AnchorTrimmerTest { + + @Test + void testNullNode() { + assertNotTrimmed(null); + } + + @Test + void testEmptyNode() { + assertNotTrimmed(""); + } + + @Test + void testRegexesWithoutAnchors() { + assertNotTrimmed("123.*"); + assertNotTrimmed("(234[0-9]|342).*"); + assertNotTrimmed(".*234\\d\\.3{3}"); + } + + @Test + void testRegexesWithAnchors() { + assertTrimmedTo("^123.*$", "123.*"); + assertTrimmedTo("^123.*$|^65[0-9]{3}$", "123.*|65[0-9]{3}"); + assertTrimmedTo("(^123.*$)|(^65[0-9]{3}$)", "(123.*)|(65[0-9]{3})"); + assertTrimmedTo("(^123.*|65[0-9]{3}$)", 
"(123.*|65[0-9]{3})"); + assertTrimmedTo("^123.*|65[0-9]{3}$", "123.*|65[0-9]{3}"); + } + + private void assertNotTrimmed(String pattern) { + assertTrimmedTo(pattern, pattern); + } + + private void assertTrimmedTo(String pattern, String expectedPattern) { + Node actual = AnchorTrimmer.trim(parse(pattern)); + if (expectedPattern == null) { + assertThat(actual).isNull(); + } else { + Node expected = parse(expectedPattern); + assertThat(actual).isEqualTreeTo(expected); + } + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/DecimalPointPlacerTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/DecimalPointPlacerTest.java new file mode 100644 index 00000000000..d70bfe3362a --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/DecimalPointPlacerTest.java @@ -0,0 +1,426 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.NodeAssert.assertThat; +import static datawave.data.normalizer.regex.RegexParser.parse; + +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + +import datawave.data.normalizer.regex.Node; + +class DecimalPointPlacerTest { + + @Nested + class PositiveVariants { + + @Test + void testSimpleNumbers() { + assertAdded("234", "\\+cE2\\.34"); + assertAdded("234|454", "\\+cE2\\.34|\\+cE4\\.54"); + } + + @Test + void testSingleLengthPatterns() { + assertAdded("[3-9]", "\\+aE[3-9]"); + assertAdded("\\d", "\\+aE\\d"); + assertAdded(".", "\\+aE."); + } + + @Test + void testLeadingMultiWildcards() { + assertAdded(".*", "\\+[a-zA-Z]E.*"); + assertAdded(".*?", "\\+[a-zA-Z]E.*?"); + assertAdded(".+", "\\+[a-zA-Z]E.+"); + assertAdded(".+?", "\\+[a-zA-Z]E.+?"); + + // In the case of .*, allow for a possible decimal point occurring after it, and after the next character. 
+ assertAdded(".*454", "\\+[c-zA-Z]E.*4\\.?54"); + assertAdded(".*?45", "\\+[b-zA-Z]E.*?4\\.?5"); + + // In the case of a leading .+, allow for a possible decimal point occurring after it, and after the next character. We must account for when .+ + // might + // be a decimal point, such as for the number 0.343. + assertAdded(".+343", "\\+[c-zA-Z]E.*3\\.?43"); + assertAdded(".+?343", "\\+[c-zA-Z]E.*?3\\.?43"); + } + + @Test + void testConsolidatedZeros() { + assertAdded(".*000004", "\\+[a-zA-Z]E.*(0{5})?4"); + } + + @Test + void testLeadingQuantifiersForSingleChar() { + assertAdded("4*11", "\\+[b-z]E4?\\.?4*1\\.?1"); + assertAdded("4+11", "\\+[c-z]E4\\.?4*11"); + assertAdded("4{3}11", "\\+eE4\\.4{2}11"); + assertAdded("4{3,5}11", "\\+[e-g]E4\\.4{2,4}11"); + assertAdded("4{1,5}11", "\\+[c-g]E4\\.4{0,4}11"); + assertAdded("4{1,}11", "\\+[c-z]E4\\.?4*11"); + assertAdded("4{2,}11", "\\+[d-z]E4\\.4+11"); + assertAdded("4{1,2}11", "\\+[c-d]E4\\.4{0,1}11"); + assertAdded("4{1}11", "\\+cE4\\.11"); + assertAdded("4{2}11", "\\+dE4\\.411"); + assertAdded("4{0,5}11", "\\+[b-g]E4?\\.4{0,4}11"); + + assertAdded("4{1,5}.*", "\\+[a-z]E4\\.?4{0,4}.*"); + assertAdded("4{1,2}.*", "\\+[a-z]E4\\.?4{0,1}.*"); + assertAdded("4{0,5}.*", "\\+[a-z]E4?\\.?4{0,4}.*"); + + assertAdded("4{1,5}", "\\+[a-e]E4\\.?4{0,4}"); + assertAdded("4{1,2}", "\\+[a-b]E4\\.?4{0,1}"); + assertAdded("4{0,5}", "\\+[a-e]E4?\\.?4{0,4}"); + } + + @Test + void testLeadingQuantifiersForWildcard() { + assertAdded(".*11", "\\+[b-zA-Z]E.*1\\.?1"); + assertAdded(".+11", "\\+[b-zA-Z]E.*1\\.?1"); + assertAdded(".{3}11", "\\+[b-eW-Z]E.?\\.?.{0,2}1\\.?1"); + assertAdded(".{3,5}11", "\\+[b-gU-Z]E(.\\.?.{2,4})?1\\.?1"); + assertAdded(".{1,5}11", "\\+[b-gU-Z]E(.\\.?.{0,4})?1\\.?1"); + assertAdded(".{1,}11", "\\+[b-zA-Z]E(.\\.?.*)?1\\.?1"); + assertAdded(".{2,}11", "\\+[b-zA-Z]E(.\\.?.+)?1\\.?1"); + assertAdded(".{1,2}11", "\\+[b-dX-Z]E(.\\.?.{0,1})?1\\.?1"); + assertAdded(".{1}11", "\\+[b-cY-Z]E.?\\.?1\\.?1"); + 
assertAdded(".{2}11", "\\+[b-dX-Z]E.?\\.?.{0,1}1\\.?1"); + assertAdded(".{0,5}11", "\\+[b-gU-Z]E.?\\.?.{0,4}1\\.?1"); + + assertAdded(".{1,5}.*", "\\+[a-zA-Z]E(.\\.?.{0,4})?.*"); + assertAdded(".{1,2}.*", "\\+[a-zA-Z]E(.\\.?.{0,1})?.*"); + assertAdded(".{0,5}.*", "\\+[a-zA-Z]E.?\\.?.{0,4}.*"); + + assertAdded(".{1,5}", "\\+[a-eU-Z]E.\\.?.{0,4}"); + assertAdded(".{1,2}", "\\+[a-bX-Z]E.\\.?.{0,1}"); + assertAdded(".{0,5}", "\\+[a-eU-Z]E.?\\.?.{0,4}"); + } + + @Test + void testLeadingQuantifiersForDigitCharClass() { + assertAdded("\\d*11", "\\+[b-z]E\\d?\\.?\\d*1\\.?1"); + assertAdded("\\d+11", "\\+[b-z]E\\d?\\.?\\d*1\\.?1"); + assertAdded("\\d{3}11", "\\+[b-e]E\\d?\\.?\\d{0,2}1\\.?1"); + assertAdded("\\d{3,5}11", "\\+[b-g]E(\\d\\.?\\d{2,4})?1\\.?1"); + assertAdded("\\d{1,5}11", "\\+[b-g]E(\\d\\.?\\d{0,4})?1\\.?1"); + assertAdded("\\d{1,}11", "\\+[b-z]E(\\d\\.?\\d*)?1\\.?1"); + assertAdded("\\d{2,}11", "\\+[b-z]E(\\d\\.?\\d+)?1\\.?1"); + assertAdded("\\d{1,2}11", "\\+[b-d]E(\\d\\.?\\d{0,1})?1\\.?1"); + assertAdded("\\d{1}11", "\\+[b-c]E\\d?\\.?1\\.?1"); + assertAdded("\\d{2}11", "\\+[b-d]E\\d?\\.?\\d{0,1}1\\.?1"); + assertAdded("\\d{0,5}11", "\\+[b-g]E\\d?\\.?\\d{0,4}1\\.?1"); + + assertAdded("\\d{1,5}.*", "\\+[a-zA-Z]E(\\d\\.?\\d{0,4})?.*"); + assertAdded("\\d{1,2}.*", "\\+[a-zA-Z]E(\\d\\.?\\d{0,1})?.*"); + assertAdded("\\d{0,5}.*", "\\+[a-zA-Z]E\\d?\\.?\\d{0,4}.*"); + + assertAdded("\\d{1,5}", "\\+[a-e]E\\d\\.?\\d{0,4}"); + assertAdded("\\d{1,2}", "\\+[a-b]E\\d\\.?\\d{0,1}"); + assertAdded("\\d{0,5}", "\\+[a-e]E\\d?\\.?\\d{0,4}"); + } + + @Test + void testLeadingQuantifiersForCharClassContainingZero() { + assertAdded("[012]*11", "\\+[b-z]E[012]?\\.?[012]*1\\.?1"); + assertAdded("[012]+11", "\\+[b-z]E[012]?\\.?[012]*1\\.?1"); + assertAdded("[012]{3}11", "\\+[b-e]E[012]?\\.?[012]{0,2}1\\.?1"); + assertAdded("[012]{3,5}11", "\\+[b-g]E([012]\\.?[012]{2,4})?1\\.?1"); + assertAdded("[012]{1,5}11", "\\+[b-g]E([012]\\.?[012]{0,4})?1\\.?1"); + assertAdded("[012]{1,}11", 
"\\+[b-z]E([012]\\.?[012]*)?1\\.?1"); + assertAdded("[012]{2,}11", "\\+[b-z]E([012]\\.?[012]+)?1\\.?1"); + assertAdded("[012]{1,2}11", "\\+[b-d]E([012]\\.?[012]{0,1})?1\\.?1"); + assertAdded("[012]{1}11", "\\+[b-c]E[012]?\\.?1\\.?1"); + assertAdded("[012]{2}11", "\\+[b-d]E[012]?\\.?[012]{0,1}1\\.?1"); + assertAdded("[012]{0,5}11", "\\+[b-g]E[012]?\\.?[012]{0,4}1\\.?1"); + + assertAdded("[012]{1,5}.*", "\\+[a-zA-Z]E([012]\\.?[012]{0,4})?.*"); + assertAdded("[012]{1,2}.*", "\\+[a-zA-Z]E([012]\\.?[012]{0,1})?.*"); + assertAdded("[012]{0,5}.*", "\\+[a-zA-Z]E[012]?\\.?[012]{0,4}.*"); + + assertAdded("[012]{1,5}", "\\+[a-e]E[012]\\.?[012]{0,4}"); + assertAdded("[012]{1,2}", "\\+[a-b]E[012]\\.?[012]{0,1}"); + assertAdded("[012]{0,5}", "\\+[a-e]E[012]?\\.?[012]{0,4}"); + } + + @Test + void testLeadingQuantifiersForCharClassNotContainingZero() { + assertAdded("[24]*11", "\\+[b-z]E[24]?\\.?[24]*1\\.?1"); + assertAdded("[24]+11", "\\+[c-z]E[24]\\.?[24]*11"); + assertAdded("[24]{3}11", "\\+eE[24]\\.[24]{2}11"); + assertAdded("[24]{3,5}11", "\\+[e-g]E[24]\\.[24]{2,4}11"); + assertAdded("[24]{1,5}11", "\\+[c-g]E[24]\\.[24]{0,4}11"); + assertAdded("[24]{1,}11", "\\+[c-z]E[24]\\.?[24]*11"); + assertAdded("[24]{2,}11", "\\+[d-z]E[24]\\.[24]+11"); + assertAdded("[24]{1,2}11", "\\+[c-d]E[24]\\.[24]{0,1}11"); + assertAdded("[24]{1}11", "\\+cE[24]\\.11"); + assertAdded("[24]{2}11", "\\+dE[24]\\.[24]11"); + assertAdded("[24]{0,5}11", "\\+[b-g]E[24]?\\.[24]{0,4}11"); + + assertAdded("[24]{1,5}.*", "\\+[a-z]E[24]\\.?[24]{0,4}.*"); + assertAdded("[24]{1,2}.*", "\\+[a-z]E[24]\\.?[24]{0,1}.*"); + assertAdded("[24]{0,5}.*", "\\+[a-z]E[24]?\\.?[24]{0,4}.*"); + + assertAdded("[24]{1,5}", "\\+[a-e]E[24]\\.?[24]{0,4}"); + assertAdded("[24]{1,2}", "\\+[a-b]E[24]\\.?[24]{0,1}"); + assertAdded("[24]{0,5}", "\\+[a-e]E[24]?\\.?[24]{0,4}"); + } + + /** + * Test patterns that have multiple possible leading zero elements that must all be made optional. 
+ */ + @Test + void testMultiplePossibleLeadingZeros() { + assertAdded("[30].\\d", "\\+[a-cY-Z]E[30]?\\.?.?\\.?\\d?"); + assertAdded("[30]\\d..*", "\\+[a-zA-Z]E[30]?\\.?\\d?\\.?.?\\.?.*"); + assertAdded(".*54", "\\+[b-zA-Z]E.*5\\.?4"); + assertAdded(".+54", "\\+[b-zA-Z]E.*5\\.?4"); + assertAdded("[04]{3}[05]{2}[06]", "\\+[a-f]E[04]?\\.?[04]{0,2}[05]?\\.?[05]{0,1}[06]?"); + assertAdded("[04]{3}[05]{2}[06].*.+43", "\\+[b-zA-Z]E[04]?\\.?[04]{0,2}[05]?\\.?[05]{0,1}[06]?\\.?.*.*4\\.?3"); + assertAdded("[04]{3}0000[05]{2}[06].*.+43", "\\+[b-zA-Z]E[04]?\\.?[04]{0,2}(0{4})?[05]?\\.?[05]{0,1}[06]?\\.?.*.*4\\.?3"); + assertAdded("[04]{3}0000[05]{2}[06][08]{3,5}.*.+43", + "\\+[b-zA-Z]E[04]?\\.?[04]{0,2}(0{4})?[05]?\\.?[05]{0,1}[06]?\\.?([08]\\.?[08]{2,4})?.*.*4\\.?3"); + } + + /** + * Test patterns similar to those in {@link #testMultiplePossibleLeadingZeros()}, but with a non-leading zero at the beginning, and verify that none of + * the elements after the non-leading zero are made optional. + */ + @Test + void testMultipleNonLeadingZeros() { + assertAdded("3[30]\\d..*", "\\+[c-z]E3\\.?[30]?\\d?.?.*"); + assertAdded("3.*54", "\\+[a-z]E3\\..*54"); + assertAdded("3.+54", "\\+[a-z]E3\\..+54"); + assertAdded("3[04]{3}[05]{2}[06]", "\\+gE3\\.?[04]{0,3}[05]{0,2}[06]?"); + assertAdded("3[04]{3}[05]{2}[06].*.+43", "\\+[g-z]E3\\.[04]{3}[05]{2}[06].*.+43"); + assertAdded("3[04]{3}0000[05]{2}[06].*.+43", "\\+[k-z]E3\\.[04]{3}0000[05]{2}[06].*.+43"); + assertAdded("3[04]{3}0000[05]{2}[06][08]{3,5}.*.+43", "\\+[n-z]E3\\.[04]{3}0000[05]{2}[06][08]{3,5}.*.+43"); + } + + /** + * Test patterns similar to those in {@link #testMultiplePossibleLeadingZeros()}, but with a non-leading zero somewhere in the middle. Verify that any + * possible zeros before the first non-leading zeros are made optional, but any succeeding possible zeros are not made optional. 
+ */ + @Test + void testMixedLeadingAndNonLeadingZeros() { + assertAdded("[30]\\d..*34[05]{2}[04]", "\\+[e-zA-Z]E[30]?\\.?\\d?\\.?.?\\.?.*3\\.?4[05]{0,2}[04]?"); + assertAdded(".*[05]{2}5[05]{2}4", "\\+[d-zA-Z]E.*[05]?\\.?[05]{0,1}5\\.?[05]{2}4"); + assertAdded(".+[05]{2}54[05]{2}", "\\+[d-zA-Z]E.*[05]?\\.?[05]{0,1}5\\.?4[05]{0,2}"); + assertAdded("[04]{3}[05]{2}33[06][05]{2}", "\\+[e-j]E[04]?\\.?[04]{0,2}[05]?\\.?[05]{0,1}3\\.?3[06]?[05]{0,2}"); + assertAdded("[04]{3}[05]{2}33[06][05]{2}.*.+", "\\+[e-z]E[04]?\\.?[04]{0,2}[05]?\\.?[05]{0,1}3\\.?3[06]?[05]{0,2}.*.*"); + assertAdded("[04]{3}0000[05]{2}33[06].*.+", "\\+[c-z]E[04]?\\.?[04]{0,2}(0{4})?[05]?\\.?[05]{0,1}3\\.?3[06]?.*.*"); + assertAdded("[04]{3}0000[05]{2}33[06][08]{3,5}.*.+", "\\+[f-z]E[04]?\\.?[04]{0,2}(0{4})?[05]?\\.?[05]{0,1}3\\.?3[06]?([08]{3,5})?.*.*"); + } + } + + @Nested + class NegativeVariants { + + @Test + void testSimpleNumbers() { + assertAdded("-234", "!XE7\\.66"); + assertAdded("-234|-454", "!XE7\\.66|!XE5\\.46"); + } + + @Test + void testSingleLengthPatterns() { + assertAdded("-[3-9]", "!ZE[1-7]"); + assertAdded("-\\d", "!ZE\\d"); + assertAdded("-.", "!ZE."); + } + + @Test + void testLeadingMultiWildcards() { + assertAdded("-.*", "![A-Za-z]E.+"); + assertAdded("-.*?", "![A-Za-z]E.+?"); + assertAdded("-.+", "![A-Za-z]E.+"); + assertAdded("-.+?", "![A-Za-z]E.+?"); + + // In the case of .*, allow for a possible decimal point occurring after it, and after the next character. + assertAdded("-.*454", "![A-Xa-z]E.*5\\.?46"); + assertAdded("-.*?45", "![A-Ya-z]E.*?5\\.?5"); + + // In the case of a leading .+, allow for a possible decimal point occurring after it, and after the next character. We must account for when .+ + // might + // be a decimal point, such as for the number 0.343. 
+ assertAdded("-.+343", "![A-Xa-z]E.*6\\.?57"); + assertAdded("-.+?343", "![A-Xa-z]E.*?6\\.?57"); + } + + @Test + void testConsolidatedZeros() { + assertAdded("-.*000004", "![A-Za-z]E.*(9{5})?6"); + } + + @Test + void testLeadingQuantifiersForSingleChar() { + assertAdded("-4*11", "![A-Y]E5?\\.?5*8\\.?9"); + assertAdded("-4+11", "![A-X]E5\\.?5*89"); + assertAdded("-4{3}11", "!VE5\\.5{2}89"); + assertAdded("-4{3,5}11", "![T-V]E5\\.5{2,4}89"); + assertAdded("-4{1,5}11", "![T-X]E5\\.5{0,4}89"); + assertAdded("-4{1,}11", "![A-X]E5\\.?5*89"); + assertAdded("-4{2,}11", "![A-W]E5\\.5+89"); + assertAdded("-4{1,2}11", "![W-X]E5\\.5{0,1}89"); + assertAdded("-4{1}11", "!XE5\\.89"); + assertAdded("-4{2}11", "!WE5\\.589"); + assertAdded("-4{0,5}11", "![T-Y]E5?\\.5{0,4}89"); + + assertAdded("-4{1,5}.*", "![A-Z]E(5?\\.?5{0,3}6|5\\.?5{0,4}.+)"); + assertAdded("-4{3,5}.*", "![A-X]E(5\\.5{1,3}6|5\\.5{2,4}.+)"); + assertAdded("-4{1,2}.*", "![A-Z]E(5?\\.?6|5\\.?5{0,1}.+)"); + assertAdded("-4{0,5}.*", "![A-Z]E(5?\\.?5{0,3}6|5?\\.?5{0,4}.+)"); + + assertAdded("-4{1,5}", "![V-Z]E5?\\.?5{0,3}6"); + assertAdded("-4{1,2}", "![Y-Z]E5?\\.?6"); + assertAdded("-4{0,5}", "![V-Z]E5?\\.?5{0,3}6"); + } + + @Test + void testLeadingQuantifiersForWildcard() { + assertAdded("-.*11", "![A-Ya-z]E.*8\\.?9"); + assertAdded("-.+11", "![A-Ya-z]E.*8\\.?9"); + assertAdded("-.{3}11", "![V-Ya-d]E.?\\.?.{0,2}8\\.?9"); + assertAdded("-.{3,5}11", "![T-Ya-f]E(.\\.?.{2,4})?8\\.?9"); + assertAdded("-.{1,5}11", "![T-Ya-f]E(.\\.?.{0,4})?8\\.?9"); + assertAdded("-.{1,}11", "![A-Ya-z]E(.\\.?.*)?8\\.?9"); + assertAdded("-.{2,}11", "![A-Ya-z]E(.\\.?.+)?8\\.?9"); + assertAdded("-.{1,2}11", "![W-Ya-c]E(.\\.?.{0,1})?8\\.?9"); + assertAdded("-.{1}11", "![X-Ya-b]E.?\\.?8\\.?9"); + assertAdded("-.{2}11", "![W-Ya-c]E.?\\.?.{0,1}8\\.?9"); + assertAdded("-.{0,5}11", "![T-Ya-f]E.?\\.?.{0,4}8\\.?9"); + + assertAdded("-.{1,5}.*", "![A-Za-z]E(.\\.?.{0,4}|.\\.?.{0,4}.\\.?.*)"); + assertAdded("-.{1,2}.*", 
"![A-Za-z]E(.\\.?.{0,1}|.\\.?.{0,1}.\\.?.*)"); + assertAdded("-.{0,5}.*", "![A-Za-z]E(.?\\.?.{0,4}|.?\\.?.{0,4}.\\.?.*)"); + + assertAdded("-.{1,5}", "![V-Za-f]E.\\.?.{0,4}"); + assertAdded("-.{1,2}", "![Y-Za-c]E.\\.?.{0,1}"); + assertAdded("-.{0,5}", "![V-Za-f]E.?\\.?.{0,4}"); + } + + @Test + void testLeadingQuantifiersForDigitCharClass() { + assertAdded("-\\d*11", "![A-Y]E\\d?\\.?\\d*8\\.?9"); + assertAdded("-\\d+11", "![A-Y]E\\d?\\.?\\d*8\\.?9"); + assertAdded("-\\d{3}11", "![V-Y]E\\d?\\.?\\d{0,2}8\\.?9"); + assertAdded("-\\d{3,5}11", "![T-Y]E(\\d\\.?\\d{2,4})?8\\.?9"); + assertAdded("-\\d{1,5}11", "![T-Y]E(\\d\\.?\\d{0,4})?8\\.?9"); + assertAdded("-\\d{1,}11", "![A-Y]E(\\d\\.?\\d*)?8\\.?9"); + assertAdded("-\\d{2,}11", "![A-Y]E(\\d\\.?\\d+)?8\\.?9"); + assertAdded("-\\d{1,2}11", "![W-Y]E(\\d\\.?\\d{0,1})?8\\.?9"); + assertAdded("-\\d{1}11", "![X-Y]E\\d?\\.?8\\.?9"); + assertAdded("-\\d{2}11", "![W-Y]E\\d?\\.?\\d{0,1}8\\.?9"); + assertAdded("-\\d{0,5}11", "![T-Y]E\\d?\\.?\\d{0,4}8\\.?9"); + + assertAdded("-\\d{1,5}.*", "![A-Za-z]E(\\d\\.?\\d{0,4}|\\d\\.?\\d{0,4}.\\.?.*)"); + assertAdded("-\\d{1,2}.*", "![A-Za-z]E(\\d\\.?\\d{0,1}|\\d\\.?\\d{0,1}.\\.?.*)"); + assertAdded("-\\d{0,5}.*", "![A-Za-z]E(\\d?\\.?\\d{0,4}|\\d?\\.?\\d{0,4}.\\.?.*)"); + + assertAdded("-\\d{1,5}", "![V-Z]E\\d\\.?\\d{0,4}"); + assertAdded("-\\d{1,2}", "![Y-Z]E\\d\\.?\\d{0,1}"); + assertAdded("-\\d{0,5}", "![V-Z]E\\d?\\.?\\d{0,4}"); + } + + @Test + void testLeadingQuantifiersForCharClassContainingZero() { + assertAdded("-[012]*11", "![A-Y]E[987]?\\.?[987]*8\\.?9"); + assertAdded("-[012]+11", "![A-Y]E[987]?\\.?[987]*8\\.?9"); + assertAdded("-[012]{3}11", "![V-Y]E[987]?\\.?[987]{0,2}8\\.?9"); + assertAdded("-[012]{3,5}11", "![T-Y]E([987]\\.?[987]{2,4})?8\\.?9"); + assertAdded("-[012]{1,5}11", "![T-Y]E([987]\\.?[987]{0,4})?8\\.?9"); + assertAdded("-[012]{1,}11", "![A-Y]E([987]\\.?[987]*)?8\\.?9"); + assertAdded("-[012]{2,}11", "![A-Y]E([987]\\.?[987]+)?8\\.?9"); + assertAdded("-[012]{1,2}11", 
"![W-Y]E([987]\\.?[987]{0,1})?8\\.?9"); + assertAdded("-[012]{1}11", "![X-Y]E[987]?\\.?8\\.?9"); + assertAdded("-[012]{2}11", "![W-Y]E[987]?\\.?[987]{0,1}8\\.?9"); + assertAdded("-[012]{0,5}11", "![T-Y]E[987]?\\.?[987]{0,4}8\\.?9"); + + assertAdded("-[012]{1,5}.*", "![A-Za-z]E([987]?\\.?[987]{0,3}[98]|[987]\\.?[987]{0,4}.\\.?.*)"); + assertAdded("-[012]{1,2}.*", "![A-Za-z]E([987]?\\.?[98]|[987]\\.?[987]{0,1}.\\.?.*)"); + assertAdded("-[012]{0,5}.*", "![A-Za-z]E([987]?\\.?[987]{0,3}[98]|[987]?\\.?[987]{0,4}.\\.?.*)"); + + assertAdded("-[012]{1,5}", "![V-Z]E[987]?\\.?[987]{0,3}[98]"); + assertAdded("-[012]{1,2}", "![Y-Z]E[987]?\\.?[98]"); + assertAdded("-[012]{0,5}", "![V-Z]E[987]?\\.?[987]{0,3}[98]"); + } + + @Test + void testLeadingQuantifiersForCharClassNotContainingZero() { + assertAdded("-[24]*11", "![A-Y]E[75]?\\.?[75]*8\\.?9"); + assertAdded("-[24]+11", "![A-X]E[75]\\.?[75]*89"); + assertAdded("-[24]{3}11", "!VE[75]\\.[75]{2}89"); + assertAdded("-[24]{3,5}11", "![T-V]E[75]\\.[75]{2,4}89"); + assertAdded("-[24]{1,5}11", "![T-X]E[75]\\.[75]{0,4}89"); + assertAdded("-[24]{1,}11", "![A-X]E[75]\\.?[75]*89"); + assertAdded("-[24]{2,}11", "![A-W]E[75]\\.[75]+89"); + assertAdded("-[24]{1,2}11", "![W-X]E[75]\\.[75]{0,1}89"); + assertAdded("-[24]{1}11", "!XE[75]\\.89"); + assertAdded("-[24]{2}11", "!WE[75]\\.[75]89"); + assertAdded("-[24]{0,5}11", "![T-Y]E[75]?\\.[75]{0,4}89"); + + assertAdded("-[24]{1,5}.*", "![A-Z]E([75]?\\.?[75]{0,3}[86]|[75]\\.?[75]{0,4}.+)"); + assertAdded("-[24]{1,2}.*", "![A-Z]E([75]?\\.?[86]|[75]\\.?[75]{0,1}.+)"); + assertAdded("-[24]{0,5}.*", "![A-Z]E([75]?\\.?[75]{0,3}[86]|[75]?\\.?[75]{0,4}.+)"); + + assertAdded("-[24]{1,5}", "![V-Z]E[75]?\\.?[75]{0,3}[86]"); + assertAdded("-[24]{1,2}", "![Y-Z]E[75]?\\.?[86]"); + assertAdded("-[24]{0,5}", "![V-Z]E[75]?\\.?[75]{0,3}[86]"); + } + + /** + * Test patterns that have multiple possible leading zero elements that must all be made optional. 
+ */ + @Test + void testMultiplePossibleLeadingZeros() { + assertAdded("-[30].\\d", "![X-Za-b]E(7|[69]\\.?.|[69]\\.?.\\d)"); + assertAdded("-[30]\\d..*", "![A-Za-z]E(7|[69]\\.?\\d|[69]\\.?\\d.|[69]\\.?\\d..+)"); + assertAdded("-.*54", "![A-Ya-z]E.*4\\.?6"); + assertAdded("-.+54", "![A-Ya-z]E.*4\\.?6"); + assertAdded("-[04]{3}[05]{2}[06]", "![U-Z]E([95]?\\.?[95]{0,1}6|[95]?\\.?[95]{0,2}[94]?\\.?5|[95]?\\.?[95]{0,2}[94]?\\.?[94]{0,1}4)"); + assertAdded("-[04]{3}[05]{2}[06].*.+43", "![A-Ya-z]E[95]?\\.?[95]{0,2}[94]?\\.?[94]{0,1}[93]?\\.?.*.*5\\.?7"); + assertAdded("-[04]{3}0000[05]{2}[06].*.+43", "![A-Ya-z]E[95]?\\.?[95]{0,2}(9{4})?[94]?\\.?[94]{0,1}[93]?\\.?.*.*5\\.?7"); + assertAdded("-[04]{3}0000[05]{2}[06][08]{3,5}.*.+43", + "![A-Ya-z]E[95]?\\.?[95]{0,2}(9{4})?[94]?\\.?[94]{0,1}[93]?\\.?([91]\\.?[91]{2,4})?.*.*5\\.?7"); + } + + /** + * Test patterns similar to those in {@link #testMultiplePossibleLeadingZeros()}, but with a non-leading zero at the beginning, and verify that none of + * the elements after the non-leading zero are made optional. + */ + @Test + void testMultipleNonLeadingZeros() { + assertAdded("-3[30]\\d..*", "![A-X]E(7|6\\.7|6\\.?[69]\\d|6\\.?[69]\\d.|6\\.?[69]\\d..+)"); + assertAdded("-3.*54", "![A-Z]E6\\..*46"); + assertAdded("-3.+54", "![A-Z]E6\\..+46"); + assertAdded("-3[04]{3}[05]{2}[06]", "!TE(7|6\\.[95]{0,2}6|6\\.[95]{0,3}[94]{0,1}5|6\\.[95]{0,3}[94]{0,2}4)"); + assertAdded("-3[04]{3}[05]{2}[06].*.+43", "![A-T]E6\\.[95]{3}[94]{2}[93].*.+57"); + assertAdded("-3[04]{3}0000[05]{2}[06].*.+43", "![A-P]E6\\.[95]{3}9999[94]{2}[93].*.+57"); + assertAdded("-3[04]{3}0000[05]{2}[06][08]{3,5}.*.+43", "![A-M]E6\\.[95]{3}9999[94]{2}[93][91]{3,5}.*.+57"); + } + + /** + * Test patterns similar to those in {@link #testMultiplePossibleLeadingZeros()}, but with a non-leading zero somewhere in the middle. Verify that any + * possible zeros before the first non-leading zeros are made optional, but any succeeding possible zeros are not made optional. 
+ */ + @Test + void testMixedLeadingAndNonLeadingZeros() { + assertAdded("-[30]\\d..*34[05]{2}[04]", "![A-Va-z]E[69]?\\.?\\d?\\.?.?\\.?.*6\\.?(6|5[94]{0,1}5|5[94]{0,2}6)"); + assertAdded("-.*[05]{2}5[05]{2}4", "![A-Wa-z]E.*[94]?\\.?[94]{0,1}4\\.?[94]{2}6"); + assertAdded("-.+[05]{2}54[05]{2}", "![A-Wa-z]E.*[94]?\\.?[94]{0,1}4\\.?(6|5[94]{0,1}5)"); + assertAdded("-[04]{3}[05]{2}33[06][05]{2}", "![Q-V]E[95]?\\.?[95]{0,2}[94]?\\.?[94]{0,1}6\\.?(7|64|6[93][94]{0,1}5)"); + assertAdded("-[04]{3}[05]{2}33[06][05]{2}.*.+", + "![A-V]E[95]?\\.?[95]{0,2}[94]?\\.?[94]{0,1}6\\.?(7|64|6[93][94]{0,1}5|6[93][94]{0,2}.+|6[93][94]{0,2}.*.+)"); + assertAdded("-[04]{3}0000[05]{2}33[06].*.+", "![A-X]E[95]?\\.?[95]{0,2}(9{4})?[94]?\\.?[94]{0,1}6\\.?(7|64|6[93].+|6[93].*.+)"); + assertAdded("-[04]{3}0000[05]{2}33[06][08]{3,5}.*.+", + "![A-U]E[95]?\\.?[95]{0,2}(9{4})?[94]?\\.?[94]{0,1}6\\.?(7|64|6[93][91]{2,4}2|6[93][91]{3,5}.+|6[93][91]{3,5}.*.+)"); + } + } + + public void assertAdded(String pattern, String expectedPattern) { + Node actual = SimpleNumberEncoder.encode(parse(pattern)); + actual = ExponentialBinAdder.addBins(actual); + actual = ZeroTrimmer.trim(actual); + actual = NegativeNumberPatternInverter.invert(actual); + actual = DecimalPointPlacer.addDecimalPoints(actual); + assertThat(actual).asTreeString().isEqualTo(expectedPattern); + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/DecimalPointValidatorTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/DecimalPointValidatorTest.java new file mode 100644 index 00000000000..ed236035fc9 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/DecimalPointValidatorTest.java @@ -0,0 +1,64 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.RegexParser.parse; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import org.junit.jupiter.api.Test; + +class 
DecimalPointValidatorTest { + + /** + * Verify that validating a null node does not result in an exception. + */ + @Test + void testNullNode() { + assertValid(null); + } + + /** + * Verify that validating an empty regex does not result in an exception. + */ + @Test + void testEmptyRegex() { + assertValid(""); + } + + /** + * Verify that validating sub-expressions with one decimal point does not result in exceptions. + */ + @Test + void testSingleDecimalPoints() { + assertValid("23\\.3"); + assertValid("23\\.3|34\\.343"); + } + + /** + * Verify that validating sub-expressions with more than one decimal point results in exceptions. + */ + @Test + void testMultipleDecimalPoints() { + assertInvalid("34\\.34\\.3"); + assertInvalid("333|.*\\.43\\.34"); + } + + /** + * Verify that alternations with valid combos do not result in an exception. + */ + @Test + void testValidAlternations() { + assertValid("343|65\\.34|45\\.343.*"); + } + + private void assertValid(String pattern) { + validate(pattern); + } + + private void assertInvalid(String pattern) { + assertThatThrownBy(() -> validate(pattern)).isInstanceOf(IllegalArgumentException.class) + .hasMessage("Regex may not contain expressions with than one decimal point."); + } + + private void validate(String pattern) { + DecimalPointValidator.validate(parse(pattern)); + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/EmptyLeafTrimmerTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/EmptyLeafTrimmerTest.java new file mode 100644 index 00000000000..cb1fce1a406 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/EmptyLeafTrimmerTest.java @@ -0,0 +1,74 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.NodeAssert.assertThat; +import static datawave.data.normalizer.regex.RegexParser.parse; + +import org.junit.jupiter.api.Test; + +import 
datawave.data.normalizer.regex.EmptyNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.visitor.EmptyLeafTrimmer; + +class EmptyLeafTrimmerTest { + + @Test + void testTrimDoesNotModifyOriginal() { + ExpressionNode original = parse("12|()||(45|3)"); + Node trimmed = EmptyLeafTrimmer.trim(original); + + assertThat(original).isEqualTreeTo(parse("12|()||(45|3)")); + assertThat(trimmed).isEqualTreeTo(parse("12|(45|3)")); + } + + @Test + void testTrimmingTreeWithNoEmptyNodes() { + assertNotTrimmed("1|3"); + assertNotTrimmed("(234)"); + assertNotTrimmed("(234)|546"); + } + + @Test + void testTrimmingEmptyAlternations() { + assertTrimmedTo("|3", "3"); + assertTrimmedTo("3||4||5", "3|4|5"); + assertTrimmedTo("3|", "3"); + } + + @Test + void testTrimmingEmptyGroups() { + assertTrimmedTo("()|(35)", "(35)"); + assertTrimmedTo("(2|5|())", "(2|5)"); + } + + @Test + void testTrimmingRegexConsistingOfEmptyAlternationsAndGroups() { + assertTrimmedTo("|()|()", null); + } + + @Test + void testTrimmingEmptyNode() { + assertTrimmedTo("", null); + } + + @Test + void testTrimmingExpressionWithEmptyNode() { + Node node = new ExpressionNode(); + node.addChild(new EmptyNode()); + assertThat(EmptyLeafTrimmer.trim(node)).isNull(); + } + + private void assertNotTrimmed(String pattern) { + assertTrimmedTo(pattern, pattern); + } + + private void assertTrimmedTo(String pattern, String expectedPattern) { + Node actual = EmptyLeafTrimmer.trim(parse(pattern)); + if (expectedPattern == null) { + assertThat(actual).isNull(); + } else { + Node expected = parse(expectedPattern); + assertThat(actual).isEqualTreeTo(expected); + } + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ExponentialBinAdderTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ExponentialBinAdderTest.java new file mode 100644 index 00000000000..ef5a84c8123 --- 
/dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ExponentialBinAdderTest.java @@ -0,0 +1,445 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.NodeAssert.assertThat; +import static datawave.data.normalizer.regex.RegexParser.parse; + +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + +import datawave.data.normalizer.regex.Node; + +class ExponentialBinAdderTest { + + @Nested + class PositiveVariants { + + @Test + void testLeadingZerosWithoutDecimalPoint() { + // Test leading explicit zero. + assertBins("00345.*", "\\+[c-z]E00345.*"); + + // Test digit character class. + assertBins("\\d0345.*", "\\+[c-z]E\\d0345.*"); + + // Test leading character class that only matches zero. + assertBins("[0]0345.*", "\\+[c-z]E[0]0345.*"); + + // Test leading character class that can match zero and other numbers. + assertBins("[05-7]0345.*", "\\+[c-z]E[05-7]0345.*"); + + // Test leading character class that cannot match zero. + assertBins("[5-7]0345.*", "\\+[e-z]E[5-7]0345.*"); + + // Test leading potential zero without multi-wildcard to test upper bin correctness. + assertBins("0[0-6]06", "\\+[a-c]E0[0-6]06"); + + // Test leading potential zeros with non-multi wildcards. + assertBins("0[0-6]0.06", "\\+[a-eY]E0[0-6]0.06"); + + // Test leading zeros with repetition quantifiers. + assertBins("[06]{3}", "\\+[a-c]E[06]{3}"); + assertBins("[06]{3}[01789]{5}", "\\+[a-h]E[06]{3}[01789]{5}"); + assertBins("[06]{3}[01789]{5}[124678]", "\\+[a-i]E[06]{3}[01789]{5}[124678]"); + } + + @Test + void testLeadingZerosWithDecimalPoint() { + // Test leading explicit zero. + assertBins("00\\.00345.*", "\\+XE00\\.00345.*"); + + // Test leading character class that only matches zero. + assertBins("[0]0\\.0[0]345.*", "\\+XE[0]0\\.0[0]345.*"); + + // Test leading character class that can match zero and other numbers. 
+ assertBins("[05-7]0\\.[045]0345.*", "\\+[bX-Z]E[05-7]0\\.[045]0345.*"); + + // Test leading character class that cannot match zero. + assertBins("0\\.0[5-7]0345.*", "\\+YE0\\.0[5-7]0345.*"); + + // Test leading zeros with repetition quantifiers. + assertBins("\\.[06]{3}", "\\+[W-Z]E\\.[06]{3}"); + assertBins("[06]{3}\\.[01789]{5}[124678]", "\\+[a-cU-Z]E[06]{3}\\.[01789]{5}[124678]"); + assertBins("[06]{3}\\.[01789]{5}[124678]", "\\+[a-cU-Z]E[06]{3}\\.[01789]{5}[124678]"); + assertBins("\\.00[06]{3}.{3}", "\\+[R-X]E\\.00[06]{3}.{3}"); + assertBins("\\.00[06]*", "\\+[A-X]E\\.00[06]*"); + assertBins("\\.00[06]+", "\\+[A-X]E\\.00[06]+"); + } + + @Test + void testWildcards() { + // Test a wildcard in the middle of a whole number. + assertBins("234.65", "\\+[c-f]E234.65"); + + // Test a wildcard before a decimal point. It should count towards the bin. + assertBins("234.65\\.045", "\\+fE234.65\\.045"); + + // Test a wildcard after a decimal point. It should not count towards the bin. + assertBins("234\\.0.", "\\+cE234\\.0."); + + // Test multiple wildcards. The first location of a wildcard should mark the smallest bin number, since we could match against a number with a + // decimal + // point there. + assertBins("87.43.33.33.", "\\+[b-l]E87.43.33.33."); + + // Test single wildcard. + assertBins(".", "\\+aE."); + + // Test single wildcard at beginning up to max bin. + assertBins(".3333333333333333333333333", "\\+[y-zZ]E.3333333333333333333333333"); + + // Test leading zeros with wildcards. + assertBins("0\\.000.000.34", "\\+[R-W]E0\\.000.000.34"); + + assertBins(".*54", "\\+[b-zA-Z]E.*54"); + + assertBins(".+54", "\\+[b-zA-Z]E.+54"); + } + + @Test + void testCharacterClasses() { + // Test a character class in the middle of a whole number. + assertBins("23[3-6]65", "\\+eE23[3-6]65"); + + // Test a character class before a decimal point. It should count towards the bin. 
+ assertBins("[3-6]342\\.34", "\\+dE[3-6]342\\.34"); + + // Test a character class after a decimal point. It should not count towards the bin. + assertBins("234\\.3[3-6]", "\\+cE234\\.3[3-6]"); + + // Test multiple character classes. + assertBins("[3][3-5]35[54]", "\\+eE[3][3-5]35[54]"); + + // Test min GTEOne bin. + assertBins("[3-5]", "\\+aE[3-5]"); + + // Test max GTEOne bin. + assertBins("[3-5]3333333333333333333333333", "\\+zE[3-5]3333333333333333333333333"); + + // Test min LTOne bin. + assertBins("\\.[3-5]", "\\+ZE\\.[3-5]"); + + // Test max LTOne bin. + assertBins("\\.0000000000000000000000000[3-5]", "\\+AE\\.0000000000000000000000000[3-5]"); + + // Test character classes that can be leading zeroes for numbers less than one. + assertBins("0\\.[0-4][03-5][045]3", "\\+[W-Z]E0\\.[0-4][03-5][045]3"); + } + + @Test + void testRepetitions() { + // Test a character class in the middle of a whole number. + assertBins("23{2}65", "\\+eE23{2}65"); + + // Test a character class range before a decimal point. It should count towards the bin. + assertBins("3{4,6}42\\.34", "\\+[f-h]E3{4,6}42\\.34"); + + // Test a character class after decimal point for a non-leading zero character. It should not count towards the bin if it's not after a leading + // zero. + assertBins("234\\.3{3}", "\\+cE234\\.3{3}"); + + // Test repetitions after characters that can be a leading zero. + assertBins("0\\.0{3}34", "\\+WE0\\.0{3}34"); + assertBins("0\\.0{0,5}34", "\\+[U-Z]E0\\.0{0,5}34"); + assertBins("0\\.0{25}34", "\\+AE0\\.0{25}34"); + assertBins("0\\..{3}34", "\\+[W-Z]E0\\..{3}34"); + assertBins("0\\.[0-4]{0,5}34", "\\+[U-Z]E0\\.[0-4]{0,5}34"); + + // Test that range {0,} is treated like .*. + assertBins("5{0,}4", "\\+[a-z]E5{0,}4"); + assertBins("\\.0{0,}4", "\\+[A-Z]E\\.0{0,}4"); + } + + @Test + void testZeroOrMoreQuantifier() { + // Test .* at the start of a number. + assertBins(".*34", "\\+[b-zA-Z]E.*34"); + + // Test .* at the end of a number. 
+ assertBins("34.*", "\\+[b-z]E34.*"); + + // Test .* in the middle of a number. + assertBins("343.*3", "\\+[c-z]E343.*3"); + + // Test .* after a leading zero. + assertBins("0.*343", "\\+[c-zA-Z]E0.*343"); + + // Test .* after leading zero after decimal point. + assertBins("0\\.0.*34", "\\+[A-Y]E0\\.0.*34"); + } + + @Test + void testOneOrMoreQuantifier() { + // Test .+ at the start of a number. + assertBins(".+34", "\\+[b-zA-Z]E.+34"); + + // Test .+ at the end of a number. + assertBins("34.+", "\\+[b-z]E34.+"); + + // Test .+ in the middle of a number. + assertBins("343.+3", "\\+[c-z]E343.+3"); + + // Test .+ after a leading zero. + assertBins("0.+343", "\\+[c-zA-Z]E0.+343"); + + // Test .+ after leading zero after decimal point. + assertBins("0\\.0.+34", "\\+[A-Y]E0\\.0.+34"); + } + + @Test + void testSingleElementPatterns() { + assertBins(".", "\\+aE."); + assertBins(".*", "\\+[a-zA-Z]E.*"); + assertBins(".*?", "\\+[a-zA-Z]E.*?"); + assertBins(".+", "\\+[a-zA-Z]E.+"); + assertBins(".+?", "\\+[a-zA-Z]E.+?"); + assertBins("[14]", "\\+aE[14]"); + assertBins("[14]{3}", "\\+cE[14]{3}"); + assertBins("\\d", "\\+aE\\d"); + assertBins("\\d{3}", "\\+[a-c]E\\d{3}"); + } + + @Test + void testZerosAfterPossibleLeadingZero() { + assertBins("[05-7]000\\.34.*", "\\+[dZ]E[05-7]000\\.34.*"); + + assertBins("0.00\\.*", "\\+[cZ]E0.00\\.*"); + + // Test zeros after a single wildcard. + assertBins(".0000\\.34.*", "\\+[eZ]E.0000\\.34.*"); + + // Test zeros after .*. + assertBins(".*0000\\.34.*", "\\+[e-zZ]E.*0000\\.34.*"); + + // Test zeros after .+. + assertBins(".+0000\\.34.*", "\\+[e-zZ]E.+0000\\.34.*"); + + // Test zeros after multiple wildcards. + assertBins("...0000\\.34.*", "\\+[e-gZ]E...0000\\.34.*"); + + // Test zeros after a digit character class. + assertBins("\\d0000\\.34.*", "\\+[eZ]E\\d0000\\.34.*"); + + // Test zeros after a character class containing zero. + assertBins("[0-4]0000\\.34.*", "\\+[eZ]E[0-4]0000\\.34.*"); + + // Test zeros that are single char classes. 
+ assertBins(".[0][0][0][0]\\.34.*", "\\+[eZ]E.[0][0][0][0]\\.34.*"); + + // Test only zeros in single char classes. + assertBins("[0][0][0]\\.34.*", "\\+ZE[0][0][0]\\.34.*"); + } + + @Test + void testLeadingQuantifiers() { + assertBins(".*54", "\\+[b-zA-Z]E.*54"); + assertBins(".+54", "\\+[b-zA-Z]E.+54"); + assertBins(".{3}54", "\\+[b-eW-Z]E.{3}54"); + assertBins(".{0,3}54", "\\+[b-eW-Z]E.{0,3}54"); + assertBins(".{2,5}54", "\\+[b-gU-Z]E.{2,5}54"); + assertBins(".{2,}54", "\\+[b-zA-Z]E.{2,}54"); + + assertBins(".*0054", "\\+[b-zA-Z]E.*0054"); + assertBins(".+0054", "\\+[b-zA-Z]E.+0054"); + assertBins(".{3}0054", "\\+[b-gU-Z]E.{3}0054"); + assertBins(".{0,3}0054", "\\+[b-gU-Z]E.{0,3}0054"); + assertBins(".{2,5}0054", "\\+[b-iS-Z]E.{2,5}0054"); + assertBins(".{2,}0054", "\\+[b-zA-Z]E.{2,}0054"); + } + + @Test + void testMixedWildcardLocations() { + assertBins("54.00", "\\+[b-e]E54.00"); + assertBins("00.", "\\+aE00."); + assertBins("00.033.324", "\\+[b-hY]E00.033.324"); + assertBins("00.034323.34", "\\+[e-jY]E00.034323.34"); + assertBins("00.0[06][09].33", "\\+[b-gV-Y]E00.0[06][09].33"); + assertBins(".000.0\\.34.*", "\\+[b-fZ]E.000.0\\.34.*"); + } + } + + @Nested + class NegativeVariants { + + @Test + void testLeadingZerosWithoutDecimalPoint() { + // Test leading explicit zero. + assertBins("-00345.*", "![A-X]E00345.*"); + + // Test digit character class. + assertBins("-\\d0345.*", "![A-X]E\\d0345.*"); + + // Test leading character class that only matches zero. + assertBins("-[0]0345.*", "![A-X]E[0]0345.*"); + + // Test leading character class that can match zero and other numbers. + assertBins("-[05-7]0345.*", "![A-X]E[05-7]0345.*"); + + // Test leading character class that cannot match zero. + assertBins("-[5-7]0345.*", "![A-V]E[5-7]0345.*"); + + // Test leading potential zero without multi-wildcard to test upper bin correctness. + assertBins("-0[0-6]06", "![X-Z]E0[0-6]06"); + + // Test leading potential zeros with non-multi wildcards. 
+ assertBins("-0[0-6]0.06", "![V-Zb]E0[0-6]0.06"); + + // Test leading zeros with repetition quantifiers. + assertBins("-[06]{3}", "![X-Z]E[06]{3}"); + assertBins("-[06]{3}[01789]{5}", "![S-Z]E[06]{3}[01789]{5}"); + assertBins("-[06]{3}[01789]{5}[124678]", "![R-Z]E[06]{3}[01789]{5}[124678]"); + } + + @Test + void testLeadingZerosWithDecimalPoint() { + // Test leading explicit zero. + assertBins("-00\\.00345.*", "!cE00\\.00345.*"); + + // Test leading character class that only matches zero. + assertBins("-[0]0\\.0[0]345.*", "!cE[0]0\\.0[0]345.*"); + + // Test leading character class that can match zero and other numbers. + assertBins("-[05-7]0\\.[045]0345.*", "![Ya-c]E[05-7]0\\.[045]0345.*"); + + // Test leading character class that cannot match zero. + assertBins("-0\\.0[5-7]0345.*", "!bE0\\.0[5-7]0345.*"); + + // Test leading zeros with repetition quantifiers. + assertBins("-\\.[06]{3}", "![a-d]E\\.[06]{3}"); + assertBins("-[06]{3}\\.[01789]{5}[124678]", "![X-Za-f]E[06]{3}\\.[01789]{5}[124678]"); + assertBins("-[06]{3}\\.[01789]{5}[124678]", "![X-Za-f]E[06]{3}\\.[01789]{5}[124678]"); + } + + @Test + void testWildcards() { + // Test a wildcard in the middle of a whole number. + assertBins("-234.65", "![U-X]E234.65"); + + // Test a wildcard before a decimal point. It should count towards the bin. + assertBins("-234.65\\.045", "!UE234.65\\.045"); + + // Test a wildcard after a decimal point. It should not count towards the bin. + assertBins("-234\\.0.", "!XE234\\.0."); + + // Test multiple wildcards. The first location of a wildcard should mark the smallest bin number, since we could match against a number with a + // decimal + // point there. + assertBins("-87.43.33.33.", "![O-Y]E87.43.33.33."); + + // Test min bin. + assertBins("-.", "!ZE."); + + // Test max bin. 
+ assertBins("-.3333333333333333333333333", "![A-Ba]E.3333333333333333333333333"); + + assertBins("-0\\.000.000.34", "![d-i]E0\\.000.000.34"); + } + + @Test + void testCharacterClasses() { + // Test a character class in the middle of a whole number. + assertBins("-23[3-6]65", "!VE23[3-6]65"); + + // Test a character class before a decimal point. It should count towards the bin. + assertBins("-[3-6]342\\.34", "!WE[3-6]342\\.34"); + + // Test a character class after a decimal point. It should not count towards the bin. + assertBins("-234\\.3[3-6]", "!XE234\\.3[3-6]"); + + // Test multiple character classes. + assertBins("-[3][3-5]35[54]", "!VE[3][3-5]35[54]"); + + // Test min GTEOne bin. + assertBins("-[3-5]", "!ZE[3-5]"); + + // Test max GTEOne bin. + assertBins("-[3-5]3333333333333333333333333", "!AE[3-5]3333333333333333333333333"); + + // Test min LTOne bin. + assertBins("-\\.[3-5]", "!aE\\.[3-5]"); + + // Test max LTOne bin. + assertBins("-\\.0000000000000000000000000[3-5]", "!zE\\.0000000000000000000000000[3-5]"); + + // Test character classes that can be leading zeroes for numbers less than one. + assertBins("-0\\.[0-4][03-5][045]3", "![a-d]E0\\.[0-4][03-5][045]3"); + } + + @Test + void testRepetitions() { + // Test a character class in the middle of a whole number. + assertBins("-23{2}65", "!VE23{2}65"); + + // Test a character class range before a decimal point. It should count towards the bin. + assertBins("-3{4,6}42\\.34", "![S-U]E3{4,6}42\\.34"); + + // Test a character class after decimal point for a non-leading zero character. It should not count towards the bin if it's not after a leading + // zero. + assertBins("-234\\.3{3}", "!XE234\\.3{3}"); + + // Test repetitions after characters that can be a leading zero. 
+ assertBins("-0\\.0{3}34", "!dE0\\.0{3}34"); + assertBins("-0\\.0{0,5}34", "![a-f]E0\\.0{0,5}34"); + assertBins("-0\\.0{25}34", "!zE0\\.0{25}34"); + assertBins("-0\\..{3}34", "![a-d]E0\\..{3}34"); + assertBins("-0\\.[0-4]{0,5}34", "![a-f]E0\\.[0-4]{0,5}34"); + + // Test that range {0,} is treated like .*. + assertBins("-5{0,}4", "![A-Z]E5{0,}4"); + assertBins("-\\.0{0,}4", "![a-z]E\\.0{0,}4"); + } + + @Test + void testZeroOrMoreQuantifier() { + // Test .* at the start of a number. + assertBins("-.*34", "![A-Ya-z]E.*34"); + + // Test .* at the end of a number. + assertBins("-34.*", "![A-Y]E34.*"); + + // Test .* in the middle of a number. + assertBins("-343.*3", "![A-X]E343.*3"); + + // Test .* after a leading zero. + assertBins("-0.*343", "![A-Xa-z]E0.*343"); + + // Test .* after leading zero after decimal point. + assertBins("-0\\.0.*34", "![b-z]E0\\.0.*34"); + } + + @Test + void testOneOrMoreQuantifier() { + // Test .+ at the start of a number. + assertBins("-.+34", "![A-Ya-z]E.+34"); + + // Test .+ at the end of a number. + assertBins("-34.+", "![A-Y]E34.+"); + + // Test .+ in the middle of a number. + assertBins("-343.+3", "![A-X]E343.+3"); + + // Test .+ after a leading zero. + assertBins("-0.+343", "![A-Xa-z]E0.+343"); + + // Test .+ after leading zero after decimal point. + assertBins("-0\\.0.+34", "![b-z]E0\\.0.+34"); + } + } + + @Test + void testEncodedNumbersAreNotModified() { + Node tree = SimpleNumberEncoder.encode(parse("234|54.*")); + Node enriched = ExponentialBinAdder.addBins(tree); + assertThat(enriched).asTreeString().isEqualTo("\\+cE2\\.34|\\+[b-z]E54.*"); + // Validate the tree structure. 
+ // @formatter:off + assertThat(enriched).assertChild(0).isAlternationNode() + .assertChild(0).isEncodedNumberNode().assertParent() + .assertChild(1).isEncodedPatternNode(); + // @formatter:on + } + + private void assertBins(String pattern, String expectedPattern) { + Node actual = ExponentialBinAdder.addBins(parse(pattern)); + assertThat(actual).asTreeString().isEqualTo(expectedPattern); + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NegativeNumberPatternInverterTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NegativeNumberPatternInverterTest.java new file mode 100644 index 00000000000..b946c662401 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NegativeNumberPatternInverterTest.java @@ -0,0 +1,125 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.NodeAssert.assertThat; +import static datawave.data.normalizer.regex.RegexParser.parse; + +import org.junit.jupiter.api.Test; + +import datawave.data.normalizer.regex.Node; +import datawave.data.type.util.NumericalEncoder; + +class NegativeNumberPatternInverterTest { + + /** + * Verify that patterns consisting of simple numbers are not modified by {@link NegativeNumberPatternInverter}. + */ + @Test + void testPatternsMadeOfSimpleNumbers() { + // Test a single positive number. + assertInverted("345", "\\+cE3\\.45"); + + // Test a single negative number. + assertInverted("-345", "!XE6\\.55"); + + // Test alternated positive and negative number. + assertInverted("345|-345", "\\+cE3\\.45|!XE6\\.55"); + } + + /** + * Verify that patterns consisting of positive number patterns are not modified by {@link NegativeNumberPatternInverter}. + */ + @Test + void testPatternsMadeOfPositivePatterns() { + // Single positive number pattern. + assertInverted(".*345", "\\+[c-zA-Z]E.*345"); + + // Alternated positive number patterns. 
+ assertInverted("45.*|0.045[3-5]", "\\+[b-z]E45.*|\\+[c-eY]E.?0?45[3-5]"); + } + + /** + * Verify that patterns consisting of positive number patterns and simple numbers are not modified by {@link NegativeNumberPatternInverter}. + */ + @Test + void testPatternsMadeOfPositivePatternsAndSimpleNumbers() { + // Alternated with positive simple number. + assertInverted("345.*|456", "\\+[c-z]E345.*|\\+cE4\\.56"); + + // Alternated with negative simple number. + assertInverted(".*345|-456", "\\+[c-zA-Z]E.*345|!XE5\\.44"); + + // Alternated with positive and negative simple number. + assertInverted("45.*|0.045[3-5]|456|-456", "\\+[b-z]E45.*|\\+[c-eY]E.?0?45[3-5]|\\+cE4\\.56|!XE5\\.44"); + } + + @Test + void testWildcard() { + assertInverted("-.234", "![W-Xa]E.?766"); + assertInverted("-34.454", "![U-Y]E65.546"); + assertInverted("-34454.", "![U-V]E6554(6|5.)"); + } + + @Test + void testMultiWildcards() { + assertInverted("-.*234", "![A-Xa-z]E.*766"); + assertInverted("-.+234", "![A-Xa-z]E.*766"); + assertInverted("-0.00454.*", "![A-Xc]E.?(9{2})?54(6|5.+)"); + } + + @Test + void testCharacterClasses() { + assertInverted("-[2-4]", "!ZE[6-8]"); + assertInverted("-[1357]", "!ZE[9753]"); + assertInverted("-[46-8]", "!ZE[62-4]"); + } + + @Test + void testDigitCharacterClass() { + assertInverted("-\\d", "!ZE\\d"); + assertInverted("-\\d*", "![A-Z]E\\d+"); + } + + @Test + void testConsolidatedLeadingZeros() { + // The consolidated zeros in (0{3})? should not be modified for a positive expression. + assertInverted(".*000.*3", "\\+[a-zA-Z]E.*(0{3})?.*3"); + + // However, in a negative expression, they should get negated to a value of 9. + assertInverted("-.*000.*3", "![A-Za-z]E.*(9{3})?.*7"); + + // Even if there could possibly be no elements after the consolidated zeros, they should get negated to a value of 9. 
+ assertInverted("-3.*000.*", "![A-Z]E(7|6.+|6.*9{3}.+)"); + } + + @Test + void testTrailingZeros() { + assertInverted("-22[0-4]", "!XE7(8|7[6-9])"); + assertInverted("-22[0157]", "!XE7(8|7[953])"); + assertInverted("-22[0157]*", "![A-Y]E7(8|7[9842]*[953])"); + assertInverted("-22[0157]+", "![A-X]E7(8|7[9842]*[953])"); + assertInverted("-22[0157]{3}", "!VE7(8|7[9842]{0,2}[953])"); + assertInverted("-22[0157]{1,3}", "![V-X]E7(8|7[9842]{0,2}[953])"); + assertInverted("-[1369]*2[0157]{1,3}0{1,3}", "![A-X]E[8630]*(8|7[9842]{0,2}[953])"); + assertInverted("-22[^03]{1,3}[06]{3,4}", "![R-U]E77([^96]{0,2}[^7]|[^96]{1,3}[93]{2,3}4)"); + } + + @Test + void testRepetitions() { + assertInverted("-4{3}", "!XE5{2}6"); + assertInverted("-4{3,6}", "![U-X]E5{2,5}6"); + assertInverted("-4{0,6}", "![U-Z]E5{0,5}6"); + assertInverted("-4{1,4}", "![W-Z]E5{0,3}6"); + assertInverted("-4{1,}", "![A-Z]E5{0,}6"); + assertInverted("-4{2,}", "![A-Y]E5{1,}6"); + } + + public void assertInverted(String pattern, String expectedPattern) { + Node actual = SimpleNumberEncoder.encode(parse(pattern)); + actual = ExponentialBinAdder.addBins(actual); + actual = ZeroTrimmer.trim(actual); + actual = NegativeNumberPatternInverter.invert(actual); + + assertThat(actual).asTreeString().isEqualTo(expectedPattern); + } + +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NegativeVariantExpanderTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NegativeVariantExpanderTest.java new file mode 100644 index 00000000000..ce2c91d1d91 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NegativeVariantExpanderTest.java @@ -0,0 +1,66 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.NodeAssert.assertThat; +import static datawave.data.normalizer.regex.RegexParser.parse; + +import org.junit.jupiter.api.Test; + +import datawave.data.normalizer.regex.Node; 
+import datawave.data.normalizer.regex.visitor.NegativeVariantExpander; + +class NegativeVariantExpanderTest { + + @Test + void testNullNode() { + assertNotExpanded(null); + } + + @Test + void testEmptyNode() { + assertNotExpanded(""); + } + + @Test + void testRegexesWithoutLeadingWildcards() { + assertNotExpanded("234.*"); + assertNotExpanded("\\..*"); + assertNotExpanded("-\\.34.*"); + assertNotExpanded("-\\.34.+"); + assertNotExpanded("[34]90.+"); + assertNotExpanded("[34]90.+"); + + // Leading wildcards with a negative sign in front of them do not need to be expanded. + assertNotExpanded("-.78"); + assertNotExpanded("-.*78"); + assertNotExpanded("-.*?78"); + assertNotExpanded("-.+78"); + assertNotExpanded("-.+?78"); + } + + @Test + void testRegexesWithLeadingWildcards() { + // Leading wildcards with no negative sign in front need to be expanded to include a negative variant. + assertExpandedTo(".454", ".454|-.454"); + assertExpandedTo(".*455", ".*455|-.*455"); + assertExpandedTo(".*?455", ".*?455|-.*?455"); + assertExpandedTo(".+455", ".+455|-.+455"); + assertExpandedTo(".+?455", ".+?455|-.+?455"); + + // Test alternations. 
+ assertExpandedTo(".455|343|[9]34.*", ".455|-.455|343|[9]34.*"); + } + + private void assertNotExpanded(String pattern) { + assertExpandedTo(pattern, pattern); + } + + private void assertExpandedTo(String pattern, String expectedPattern) { + Node actual = NegativeVariantExpander.expand(parse(pattern)); + if (expectedPattern == null) { + assertThat(actual).isNull(); + } else { + Node expected = parse(expectedPattern); + assertThat(actual).isEqualTreeTo(expected); + } + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NonEncodedNumbersCheckerTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NonEncodedNumbersCheckerTest.java new file mode 100644 index 00000000000..e35447a7e84 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NonEncodedNumbersCheckerTest.java @@ -0,0 +1,48 @@ +package datawave.data.normalizer.regex.visitor; + +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + +import org.junit.jupiter.api.Test; + +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.RegexParser; + +class NonEncodedNumbersCheckerTest { + + @Test + void testSingleSimpleNumber() { + assertHasNoNonEncodedNumbers(encodeSimpleNumbers("123")); + } + + @Test + void testSingleNonSimpleNumber() { + assertHasNonEncodedNumbers(encodeSimpleNumbers("2342.*")); + } + + @Test + void testAlternatedSimpleNumbers() { + assertHasNoNonEncodedNumbers(encodeSimpleNumbers("234|-45345")); + } + + @Test + void testAlternatedNonSimpleNumbers() { + assertHasNonEncodedNumbers(encodeSimpleNumbers("234.*|65{3}")); + } + + @Test + void testAlternatedSimpleNumberAndNonSimpleNumber() { + assertHasNonEncodedNumbers(encodeSimpleNumbers("324.*|345")); + } + + private Node encodeSimpleNumbers(String pattern) { + return SimpleNumberEncoder.encode(RegexParser.parse(pattern)); + } + + private void assertHasNonEncodedNumbers(Node node) { + 
assertThat(NonEncodedNumbersChecker.check(node)).isTrue(); + } + + private void assertHasNoNonEncodedNumbers(Node node) { + assertThat(NonEncodedNumbersChecker.check(node)).isFalse(); + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NumericCharClassValidatorTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NumericCharClassValidatorTest.java new file mode 100644 index 00000000000..c6133cf6502 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/NumericCharClassValidatorTest.java @@ -0,0 +1,84 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.RegexParser.parse; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import org.junit.jupiter.api.Test; + +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.visitor.NumericCharClassValidator; + +class NumericCharClassValidatorTest { + + /** + * Verify that validating a null node does not result in an exception. + */ + @Test + void testNullNode() { + assertValid(null); + } + + /** + * Verify that validating an empty regex does not result in an exception. + */ + @Test + void testEmptyRegex() { + assertValid(""); + } + + /** + * Verify that validating regexes without character classes do not result in exceptions. + */ + @Test + void testRegexWithoutCharacterClass() { + assertValid("123.*"); + assertValid("123.*{34}"); + assertValid("(234|34534)|343.*343.?"); + } + + /** + * Verify that validating regexes with valid character classes do not result in exceptions. + */ + @Test + void testRegexWithValidCharacterClass() { + // Allow all digits (including negated). + assertValid("[123456789]"); + assertValid("[^123456789]"); + + // Allow numeric ranges (including negated). + assertValid("[0-9]"); + assertValid("[^0-9]"); + + // Allow combinations (including negated). 
+ assertValid("[137-9]"); + assertValid("[^137-9]"); + } + + /** + * Verify that validating regexes with invalid character classes results in exceptions. + */ + @Test + void testInvalidRegexes() { + // Do not allow periods. + assertInvalid("[.]"); + assertInvalid("[^.]"); + assertInvalid("[123.]"); + + // Do not allow letter ranges. + assertInvalid("[a-z]"); + assertInvalid("[A-Z]"); + } + + private void assertValid(String pattern) { + validate(parse(pattern)); + } + + private void assertInvalid(String pattern) { + assertThatThrownBy(() -> validate(parse(pattern))).isInstanceOf(IllegalArgumentException.class) + .hasMessage("Character classes may only contain numeric characters and numeric ranges."); + } + + private void validate(Node node) { + NumericCharClassValidator.validate(node); + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/OptionalVariantExpanderTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/OptionalVariantExpanderTest.java new file mode 100644 index 00000000000..d75ff241582 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/OptionalVariantExpanderTest.java @@ -0,0 +1,83 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.NodeAssert.assertThat; +import static datawave.data.normalizer.regex.RegexParser.parse; + +import org.junit.jupiter.api.Test; + +import datawave.data.normalizer.regex.Node; + +class OptionalVariantExpanderTest { + + @Test + void testExpandingEmptyPattern() { + assertNotExpanded(""); + } + + @Test + void testExpandingPatternsWithoutOptionals() { + assertNotExpanded("123.*"); + assertNotExpanded("^(123.)$"); + assertNotExpanded(".*1234"); + assertNotExpanded(".{3}54[3-6]"); + assertNotExpanded("343|34.*|6534.*|23\\.[34]{4}"); + } + + /** + * Verify that any ? 
directly following a *, +, or a repetition quantifier are not expanded, and are kept to enforce lazy matching. + */ + @Test + void testExpandingPatternsWithLazyModifiers() { + assertNotExpanded("234.*?"); + assertNotExpanded("234.+?"); + assertNotExpanded(".*?234"); + assertNotExpanded(".+?234"); + assertNotExpanded("2.*?4"); + assertNotExpanded("2.+?4"); + assertNotExpanded("34{4}?"); + } + + @Test + void testExpandingPatternsWithOptionals() { + // An optional element located after a decimal point should not be expanded. + assertNotExpanded("232\\.4[3-6]?"); + + // An optional decimal point should be expanded. + assertExpandedTo("3\\.?6", "36|3\\.6"); + + // Only the [4-6]? needs to be expanded to variants. + assertExpandedTo(".*?35[4-6]?\\.34?", ".*?35\\.34?|.*?35[4-6]\\.34?"); + + // Optionals following other characters should be expanded. + assertExpandedTo("23?4", "24|234"); + assertExpandedTo("3[3-9]?6", "36|3[3-9]6"); + assertExpandedTo("3.?6", "36|3.6"); + assertExpandedTo("3(4.3)?5", "35|3(4.3)5"); + assertExpandedTo("-?34", "34|-34"); + + // Multiple optionals should result in multiple expansion variants. + assertExpandedTo("3.?4[36]?8?", "34|348|34[36]|34[36]8|3.4|3.48|3.4[36]|3.4[36]8"); + + // Test pattern with optional at very end. + assertExpandedTo("23?", "2|23"); + + // Test pattern of single optional character. + assertExpandedTo("2?", "2"); + + // Optionals within alternations should be expanded. 
+ assertExpandedTo("23?4|3.?6", "24|234|36|3.6"); + } + + private void assertNotExpanded(String pattern) { + assertExpandedTo(pattern, pattern); + } + + private void assertExpandedTo(String pattern, String expectedPattern) { + Node actual = OptionalVariantExpander.expand(parse(pattern)); + if (expectedPattern == null) { + assertThat(actual).isNull(); + } else { + assertThat(actual).asTreeString().isEqualTo(expectedPattern); + } + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/SimpleNumberEncoderTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/SimpleNumberEncoderTest.java new file mode 100644 index 00000000000..3f98fc00a0d --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/SimpleNumberEncoderTest.java @@ -0,0 +1,78 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.NodeAssert.assertThat; + +import org.junit.jupiter.api.Test; + +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.RegexParser; + +class SimpleNumberEncoderTest { + + @Test + void testEmpty() { + assertNotEncoded(""); + } + + @Test + void testPatternsWithoutSimpleNumbers() { + assertNotEncoded("12.*"); + assertNotEncoded("12[3-5]"); + assertNotEncoded("12{3}"); + assertNotEncoded("12\\d"); + assertNotEncoded("12."); + assertNotEncoded("12+"); + assertNotEncoded("12?"); + assertNotEncoded("(12)."); + assertNotEncoded("12.?|.*45|43.*"); + } + + @Test + void testSinglePositiveSimpleNumber() { + Node actual = assertEncoded("123\\.45", "\\+cE1\\.2345"); + // Verify an encoded number node was returned. + assertThat(actual).isEncodedNumberNode(); + } + + @Test + void testSingleNegativeNumber() { + Node actual = assertEncoded("-342", "!XE6\\.58"); + // Verify an encoded number node was returned. 
+ assertThat(actual).isEncodedNumberNode(); + } + + @Test + void testAlternatedPositiveSimpleNumberAndNonSimpleNumber() { + Node actual = assertEncoded("-342|23.*", "!XE6\\.58|23.*"); + // Verify that the alternation node has an encoded number node (0) and an expression node (1) as children. + assertThat(actual).assertChild(0).isAlternationNode().assertChild(0).isEncodedNumberNode().assertParent().assertChild(1).isExpressionNode(); + } + + @Test + void testAlternatedNegativeSimpleNumberAndNonSimpleNumber() { + Node actual = assertEncoded("-34.*|23", "-34.*|\\+bE2\\.3"); + // Verify that the alternation node has an expression node (0) and an encoded number node (1) as children. + assertThat(actual).assertChild(0).isAlternationNode().assertChild(0).isExpressionNode().assertParent().assertChild(1).isEncodedNumberNode(); + } + + @Test + void testAlternatedPositiveAndNegativeSimpleNumber() { + Node actual = assertEncoded("5345|-4452", "\\+dE5\\.345|!WE5\\.548"); + // Verify that the alternation node has two encoded number nodes as children. 
+ assertThat(actual).assertChild(0).isAlternationNode().assertChild(0).isEncodedNumberNode().assertParent().assertChild(1).isEncodedNumberNode(); + } + + private void assertNotEncoded(String pattern) { + assertEncoded(pattern, pattern); + } + + private Node assertEncoded(String pattern, String expectedPattern) { + Node actual = SimpleNumberEncoder.encode(RegexParser.parse(pattern)); + if (expectedPattern == null) { + assertThat(actual).isNull(); + } else { + assertThat(actual).asTreeString().isEqualTo(expectedPattern); + } + return actual; + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/StringVisitorTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/StringVisitorTest.java new file mode 100644 index 00000000000..cf3aa4644e4 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/StringVisitorTest.java @@ -0,0 +1,44 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.RegexParser.parse; +import static org.assertj.core.api.Assertions.assertThat; + +import org.junit.jupiter.api.Test; + +import datawave.data.normalizer.regex.EmptyNode; +import datawave.data.normalizer.regex.ExpressionNode; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.RegexParser; +import datawave.data.normalizer.regex.visitor.StringVisitor; + +class StringVisitorTest { + + @Test + void testNullNode() { + assertThat(toString(null)).isNull(); + } + + @Test + void testEmptyExpression() { + ExpressionNode node = new ExpressionNode(); + node.addChild(new EmptyNode()); + assertThat(toString(node)).isEqualTo(""); + } + + @Test + void testComplexTrees() { + assertThat(toString(parse("-234\\.3"))).isEqualTo("-234\\.3"); + assertThat(toString(parse("234.*"))).isEqualTo("234.*"); + assertThat(toString(parse("234[^65.]"))).isEqualTo("234[^65.]"); + assertThat(toString(parse("^2{3}.+"))).isEqualTo("^2{3}.+"); + 
assertThat(toString(parse("2{3,}.*"))).isEqualTo("2{3,}.*"); + assertThat(toString(parse("2{2,4}.*"))).isEqualTo("2{2,4}.*"); + assertThat(toString(parse("(23|65)"))).isEqualTo("(23|65)"); + assertThat(toString(parse("(23|65)|(34[65].*)"))).isEqualTo("(23|65)|(34[65].*)"); + assertThat(toString(parse("35\\d.+"))).isEqualTo("35\\d.+"); + } + + private String toString(Node node) { + return StringVisitor.toString(node); + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ZeroLengthRepetitionTrimmerTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ZeroLengthRepetitionTrimmerTest.java new file mode 100644 index 00000000000..2e21afd17c1 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ZeroLengthRepetitionTrimmerTest.java @@ -0,0 +1,64 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.NodeAssert.assertThat; +import static datawave.data.normalizer.regex.RegexParser.parse; + +import org.junit.jupiter.api.Test; + +import datawave.data.normalizer.regex.Node; + +class ZeroLengthRepetitionTrimmerTest { + + @Test + void testNullNode() { + assertNotTrimmed(null); + } + + @Test + void testEmptyRegex() { + assertNotTrimmed(""); + } + + @Test + void testRegexWithoutRepetitions() { + assertNotTrimmed("123.*"); + assertNotTrimmed("(234|34534)|343.*343.?"); + } + + @Test + void testRegexWithValidRepetitions() { + // Allow any non-zero combination. + assertNotTrimmed("2{1}"); + assertNotTrimmed("2{12}"); + assertNotTrimmed("2{1,6}"); + assertNotTrimmed("2{10,20}"); + + // Allow {0,} as an equivalent to *. 
+ assertNotTrimmed("2{0,}"); + } + + @Test + void testInvalidRegexes() { + assertTrimmedTo("2{0}", null); + assertTrimmedTo("2{0,0}", null); + assertTrimmedTo("3{0,0}|[4-6]{0}", null); + assertTrimmedTo("23.*5{0}", "23.*"); + assertTrimmedTo("23.*5{0,0}", "23.*"); + assertTrimmedTo("23.*5{0,0}|65{3}", "23.*|65{3}"); + } + + private void assertNotTrimmed(String pattern) { + assertTrimmedTo(pattern, pattern); + } + + private void assertTrimmedTo(String pattern, String expectedPattern) { + Node actual = ZeroLengthRepetitionTrimmer.trim(parse(pattern)); + if (expectedPattern == null) { + PrintVisitor.printToSysOut(actual); + assertThat(actual).isNull(); + } else { + Node expected = parse(expectedPattern); + assertThat(actual).isEqualTreeTo(expected); + } + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ZeroTrimmerTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ZeroTrimmerTest.java new file mode 100644 index 00000000000..d1569da77fc --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ZeroTrimmerTest.java @@ -0,0 +1,362 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.NodeAssert.assertThat; +import static datawave.data.normalizer.regex.RegexParser.parse; + +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; +import org.locationtech.jts.util.Assert; + +import datawave.data.normalizer.ZeroRegexStatus; +import datawave.data.normalizer.regex.Node; +import datawave.data.normalizer.regex.RegexParser; + +class ZeroTrimmerTest { + + @Nested + class LeadingZeros { + + @Test + void testZerosWithoutQuantifiers() { + // Test trimming explicit zeros in pattern without decimal point. + assertTrimmedTo("00345.*", "\\+[c-z]E345.*"); + + // Test trimming explicit zeros in pattern before decimal point. 
+ assertTrimmedTo("000\\.34.*", "\\+ZE34.*"); + + // Test trimming explicit zeros in pattern before and after the decimal point. + assertTrimmedTo("00\\.000034.*", "\\+VE34.*"); + + // Test trimming explicit zeros in pattern after decimal point. + assertTrimmedTo("\\.000034.*", "\\+VE34.*"); + + // Test trimming explicit zeros (including [0]) in pattern after decimal point. + assertTrimmedTo("\\.0[0][0]034.*", "\\+VE34.*"); + } + + @Test + void testZerosWithQuantifiers() { + // Test trimming leading zeros with * quantifier. + assertTrimmedTo("0*0345.*", "\\+[c-z]E345.*"); + + // Test trimming leading zeros with + quantifier. + assertTrimmedTo("0+0\\.34.*", "\\+ZE34.*"); + + // Test trimming leading zeros with {3} quantifier. + assertTrimmedTo("00\\.0[0]{3}034.*", "\\+UE34.*"); + + // Test trimming leading zeros with {3,5} quantifier. + assertTrimmedTo("\\.0{3,5}00034.*", "\\+[R-T]E34.*"); + + // Test that 0* with no other zeros. + assertTrimmedTo("0*34", "\\+bE34"); + + // Test that 0+ with no other zeros. + assertTrimmedTo("0+34", "\\+bE34"); + + // Test that 0* with other zeros. + assertTrimmedTo("00*0034", "\\+bE34"); + + // Test that 0+ with other zeros. + assertTrimmedTo("00+0034", "\\+bE34"); + + // Test that zeros with repetitions. + assertTrimmedTo("00{3}0{2}34", "\\+bE34"); + + // Test that a + after a possible zero is changed to a *. + assertTrimmedTo("[06]+\\.[01789]*[124678]", "\\+[a-zA-Z]E[06]*[01789]*[124678]"); + + // Test that a +? after a possible zero is changed to a *?. + assertTrimmedTo("[06]+?\\.[01789]*[124678]", "\\+[a-zA-Z]E[06]*?[01789]*[124678]"); + + // Test that {x} quantifiers after a possible zero will include a possible length of zero. + assertTrimmedTo("[06]{3}\\.[01789]{5}[124678]", "\\+[a-cU-Z]E[06]{0,3}[01789]{0,5}[124678]"); + + // Test that {x}? quantifiers after a possible zero will include a possible length of zero. 
+ assertTrimmedTo("[06]{3}?\\.[01789]{5}[124678]", "\\+[a-cU-Z]E[06]{0,3}?[01789]{0,5}[124678]"); + + // Test that {x,y} quantifiers after a possible zero will be made optional. + assertTrimmedTo("[06]{4,7}\\.[01789]{3,9}[124678]", "\\+[a-gQ-Z]E([06]{4,7})?([01789]{3,9})?[124678]"); + + // Test that {x,y}? quantifiers after a possible zero will be made optional. + assertTrimmedTo("[06]{4,7}?\\.[01789]{3,9}?[124678]", "\\+[a-gQ-Z]E([06]{4,7}?)?([01789]{3,9}?)?[124678]"); + } + + @Test + void testZerosAfterWildcard() { + // Test trimming explicit zeros after a single wildcard. + assertTrimmedTo(".0000\\.34.*", "\\+[eZ]E.?(0{4})?34.*"); + + // Test trimming explicit zeros after .*. + assertTrimmedTo(".*0000\\.34.*", "\\+[e-zZ]E.*(0{4})?34.*"); + + // Test trimming explicit zeros after .+. + assertTrimmedTo(".+0000\\.34.*", "\\+[e-zZ]E.*(0{4})?34.*"); + } + + @Test + void testZerosAfterMultipleWildcards() { + // Test trimming explicit zeros after a single wildcard. + assertTrimmedTo(".000.0\\.34.*", "\\+[b-fZ]E.?(0{3})?.?0?34.*"); + + // Test trimming explicit zeros after .*. + assertTrimmedTo(".*000.*0\\.34.*", "\\+[b-zZ]E.*(0{3})?.*0?34.*"); + + // Test trimming explicit zeros after .+. 
+ assertTrimmedTo(".+000.+0\\.34.*", "\\+[b-zZ]E.*(0{3})?.*0?34.*"); + } + + @Test + void testZerosAfterDecimalPointWithPossibleAllLeadingZeros() { + assertTrimmedTo(".\\.000034.*", "\\+[aV]E.?(0{4})?34.*"); + assertTrimmedTo(".*\\.000034.*", "\\+[a-zV]E.*(0{4})?34.*"); + assertTrimmedTo(".+\\.000034.*", "\\+[a-zV]E.*(0{4})?34.*"); + assertTrimmedTo(".0{3}\\.000034.*", "\\+[dV]E.?(0{7})?34.*"); + assertTrimmedTo("[034]0\\.000034.*", "\\+[bV]E[034]?(0{5})?34.*"); + } + + @Test + void testZerosAfterPossibleZeroCharacter() { + assertTrimmedTo(".000000343", "\\+[c-jT]E.?(0{6})?343"); + assertTrimmedTo(".*000000343", "\\+[c-zA-Z]E.*(0{6})?343"); + assertTrimmedTo(".+000000343", "\\+[c-zA-Z]E.*(0{6})?343"); + assertTrimmedTo("[0-9]000000343", "\\+[c-j]E[0-9]?(0{6})?343"); + } + + @Test + void testZerosWithRepetitionRange() { + // Test 0{0,}, equivalent to 0* + assertTrimmedTo(".*0{0,}3", "\\+[a-zA-Z]E.*(0*)?3"); + + // Test 0{0,} with other zeros. + assertTrimmedTo(".*00{0,}03", "\\+[a-zA-Z]E.*(0{2,})?3"); + + // Test 0{1,}, equivalent to 0+. + assertTrimmedTo(".*0{1,}3", "\\+[a-zA-Z]E.*(0+)?3"); + + // Test 0{1,} with other zeros. + assertTrimmedTo(".*00{1,}03", "\\+[a-zA-Z]E.*(0{3,})?3"); + + // Test 0{1,5} with defined end bound. + assertTrimmedTo(".*0{1,5}3", "\\+[a-zA-Z]E.*(0{1,5})?3"); + + // Test 0{1,5} with other zeros. + assertTrimmedTo(".*00{1,5}03", "\\+[a-zA-Z]E.*(0{3,7})?3"); + + // Test 0{3,} with undefined end bound. + assertTrimmedTo(".*0{3,}3", "\\+[a-zA-Z]E.*(0{3,})?3"); + + // Test 0{3,} with other zeros. + assertTrimmedTo(".*00{3,}03", "\\+[a-zA-Z]E.*(0{5,})?3"); + } + } + + @Nested + class TrailingZeros { + + @Test + void testZerosWithoutQuantifiers() { + // Test trimming explicit zeros in pattern without decimal point. + assertTrimmedTo("345.*00", "\\+[c-z]E345.*"); + + // Test trimming explicit zeros in pattern after decimal point. + assertTrimmedTo("\\.34.*00", "\\+ZE34.*"); + + // Test trimming explicit zeros (including [0]). 
+ assertTrimmedTo("34.*0[0][0]0", "\\+[b-z]E34.*"); + } + + @Test + void testZerosWithQuantifiers() { + // Test trimming zeros with * quantifier. + assertTrimmedTo("345.*0*", "\\+[c-z]E345.*"); + + // Test trimming zeros with + quantifier. + assertTrimmedTo("\\.34.*0+", "\\+ZE34.*"); + + // Test trimming zeros with {3} quantifier. + assertTrimmedTo("34.*0{3}", "\\+[b-z]E34.*"); + + // Test trimming zeros with {3,5} quantifier. + assertTrimmedTo("34.0{3,5}", "\\+[b-h]E34.?"); + + // Test that 0* with no other zeros. + assertTrimmedTo("340*", "\\+[b-z]E34"); + + // Test that 0+ with no other zeros. + assertTrimmedTo("340+", "\\+[c-z]E34"); + + // Test that 0* with other zeros. + assertTrimmedTo("3400*00", "\\+[e-z]E34"); + + // Test that 0+ with other zeros. + assertTrimmedTo("3400+00", "\\+[f-z]E34"); + + // Test that zeros with repetitions. + assertTrimmedTo("3400{3}0{2}", "\\+hE34"); + + // Test that {x} quantifiers after a possible trailing zero will include a possible length of zero. + assertTrimmedTo("[123678]3\\.[01789]{5}", "\\+bE[123678]3[01789]{0,5}"); + + // Test that {x}? quantifiers after a possible trailing zero will include a possible length of zero. + assertTrimmedTo("[124678]3\\.[01789]{5}?", "\\+bE[124678]3[01789]{0,5}?"); + + // Test that {x,y} quantifiers after a possible trailing zero will be made optional. + assertTrimmedTo("[124678]3\\.[01789]{3,9}", "\\+bE[124678]3([01789]{3,9})?"); + + // Test that {x,y}? quantifiers after a possible trailing zero will be made optional. + assertTrimmedTo("[124678]3\\.[01789]{3,9}?", "\\+bE[124678]3([01789]{3,9}?)?"); + } + + @Test + void testZerosBeforeWildcard() { + // Test trimming explicit zeros before a single wildcard. + assertTrimmedTo("23000.", "\\+[e-f]E23(0{3})?.?"); + + // Test trimming explicit zeros before .*. + assertTrimmedTo("23000.*", "\\+[e-z]E23(0{3})?.*"); + + // Test trimming explicit zeros before .+. 
Because .+ could be a number of zeros, we need to change it to .* to allow for the fact that trailing + // zeros would be trimmed in encoded numbers. + assertTrimmedTo("23000.+00.*", "\\+[e-z]E23(0{3})?.*(0{2})?.*"); + } + + @Test + void testZerosBeforeMultipleWildcards() { + // Test trimming explicit zeros before a single wildcard. + assertTrimmedTo("23.00.0.", "\\+[b-h]E23.?(0{2})?.?0?.?"); + + // Test trimming explicit zeros before .*. + assertTrimmedTo("23.*00.*0.*", "\\+[b-z]E23.*(0{2})?.*0?.*"); + + // Test trimming explicit zeros after .+. + assertTrimmedTo("23.+00.+0.+", "\\+[b-z]E23.*(0{2})?.*0?.*"); + } + + @Test + void testZerosBeforeDecimalPointWithPossibleAllTrailingZeros() { + assertTrimmedTo("3400\\.0000.", "\\+dE34(0{6})?.?"); + assertTrimmedTo("3400\\.0000.*", "\\+dE34(0{6})?.*"); + assertTrimmedTo("3400{3}\\.0000.*", "\\+fE34(0{8})?.*"); + assertTrimmedTo("34[012]0\\.0000.*", "\\+dE34[012]?(0{5})?.*"); + // The trailing .+ must become .* to allow for trimmed zeros. + assertTrimmedTo("3400\\.0000.+", "\\+dE34(0{6})?.*"); + } + + @Test + void testZerosBeforePossibleZeroCharacter() { + // The trailing .+ must become .* to allow for trimmed zeros. + assertTrimmedTo("2300000.+", "\\+[g-z]E23(0{5})?.*"); + + assertTrimmedTo("2300000.", "\\+[g-h]E23(0{5})?.?"); + assertTrimmedTo("2300000.*", "\\+[g-z]E23(0{5})?.*"); + assertTrimmedTo("2300000[0-9]", "\\+hE23(0{5})?[0-9]?"); + } + + @Test + void testZerosWithRepetitionRange() { + // Test 0{0,}, equivalent to 0* + assertTrimmedTo("3.*0{0,}[01]", "\\+[a-z]E3.*(0*)?[01]?"); + + // Test 0{0,} with other zeros. + assertTrimmedTo("3.*00{0,}0[01]", "\\+[a-z]E3.*(0{2,})?[01]?"); + + // Test 0{1,}, equivalent to 0+. + assertTrimmedTo("3.*0{1,}[01]", "\\+[a-z]E3.*(0+)?[01]?"); + + // Test 0{1,} with other zeros. + assertTrimmedTo("3.*00{1,}0[01]", "\\+[a-z]E3.*(0{3,})?[01]?"); + + // Test 0{1,5} with defined end bound. 
+ assertTrimmedTo("3.*0{1,5}[01]", "\\+[a-z]E3.*(0{1,5})?[01]?"); + + // Test 0{1,5} with other zeros. + assertTrimmedTo("3.*00{1,5}0[01]", "\\+[a-z]E3.*(0{3,7})?[01]?"); + + // Test 0{3,} with undefined end bound. + assertTrimmedTo("3.*0{3,}[01]", "\\+[a-z]E3.*(0{3,})?[01]?"); + + // Test 0{3,} with other zeros. + assertTrimmedTo("3.*00{3,}0[01]0", "\\+[a-z]E3.*(0{5,})?[01]?"); + } + } + + @Test + void testNoLeadingOrTrailingZeros() { + assertTrimmedTo(".*344", "\\+[c-zA-Z]E.*344"); + assertTrimmedTo("45.*", "\\+[b-z]E45.*"); + assertTrimmedTo("300454.*", "\\+[f-z]E300454.*"); + assertTrimmedTo("300.*0003", "\\+[c-z]E300.*0003"); + assertTrimmedTo("300.*000[1-9]", "\\+[c-z]E300.*000[1-9]"); + + } + + @Test + void testSingleElementPatterns() { + assertTrimmedTo(".", "\\+aE."); + assertTrimmedTo(".*", "\\+[a-zA-Z]E.*"); + assertTrimmedTo(".*?", "\\+[a-zA-Z]E.*?"); + assertTrimmedTo(".+", "\\+[a-zA-Z]E.+"); + assertTrimmedTo(".+?", "\\+[a-zA-Z]E.+?"); + assertTrimmedTo("[14]", "\\+aE[14]"); + assertTrimmedTo("[14]{3}", "\\+cE[14]{3}"); + assertTrimmedTo("\\d", "\\+aE\\d"); + assertTrimmedTo("\\d{3}", "\\+[a-c]E\\d{3}"); + } + + @Test + void testStatus() { + ZeroRegexStatus status = ZeroRegexStatus.NONE; + assertStatus("300.*0003", status); + assertStatus("300.*000[1-9]", status); + assertStatus("45.*", status); + assertStatus("-45.*", status); + + status = ZeroRegexStatus.LEADING; + assertStatus(".*", status); + assertStatus(".*?", status); + assertStatus(".*?11", status); + assertStatus("[04][05][06]", status); + assertStatus("[04]{1,3}[05][06]", status); + assertStatus("\\d{3}", status); + assertStatus(".\\.000034.*", status); + assertStatus("00345.*", status); + assertStatus("\\.000034.*", status); + assertStatus("-00345.*", status); + + status = ZeroRegexStatus.TRAILING; + assertStatus("3.*0{0,}[01]", status); + assertStatus("3.*?0{0,}[01]", status); + assertStatus("3400\\.0000.", status); + assertStatus("340.*", status); + assertStatus("340.*?", status); + 
assertStatus("3400{3}0{2}", status); + + } + + @Test + void testTrailingZerosWithoutQuantifiers() { + assertTrimmedTo(".*34300", "\\+[e-zA-Z]E.*343"); + } + + @Test + void testNegativeNumber() { + assertTrimmedTo("-0.00454.*", "![A-Xc]E.?(0{2})?454.*"); + } + + @Test + void testMixedAlternation() { + assertTrimmedTo("234\\.45|343.*|0\\.00[0]34.*", "\\+cE2\\.3445|\\+[c-z]E343.*|\\+WE34.*"); + } + + private void assertStatus(String pattern, ZeroRegexStatus status) { + Assert.equals(ZeroTrimmer.getStatus(RegexParser.parse(pattern).getChildren()), status); + } + + private void assertTrimmedTo(String pattern, String expectedPattern) { + Node actual = SimpleNumberEncoder.encode(parse(pattern)); + actual = ExponentialBinAdder.addBins(actual); + actual = ZeroTrimmer.trim(actual); + assertThat(actual).asTreeString().isEqualTo(expectedPattern); + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ZeroValueNormalizerTest.java b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ZeroValueNormalizerTest.java new file mode 100644 index 00000000000..f919a90a3a2 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/normalizer/regex/visitor/ZeroValueNormalizerTest.java @@ -0,0 +1,85 @@ +package datawave.data.normalizer.regex.visitor; + +import static datawave.data.normalizer.regex.NodeAssert.assertThat; +import static datawave.data.normalizer.regex.RegexParser.parse; + +import org.junit.jupiter.api.Test; + +import datawave.data.normalizer.regex.Node; + +class ZeroValueNormalizerTest { + + /** + * Test different variants of zero and negative zero. These can be handled by {@link datawave.data.type.util.NumericalEncoder#encode(String)} and do not + * need to be changed. 
+ */ + @Test + public void testSimpleNumberZeros() { + assertNotExpanded("0"); + assertNotExpanded("0\\.00"); + assertNotExpanded("-0"); + assertNotExpanded("-0\\.00"); + } + + @Test + void testPositivePatternsThatCanMatchZero() { + assertExpanded("0.*", "0.*|0"); + assertExpanded(".*0", ".*0|0"); + assertExpanded(".+0", ".+0|0"); + assertExpanded("[0-9]", "[0-9]|0"); + assertExpanded(".*0.*", ".*0.*|0"); + assertExpanded("0.", "0.|0"); + assertExpanded("0\\d", "0\\d|0"); + assertExpanded("\\d", "\\d|0"); + } + + @Test + void testPositivePatternsThatOnlyMatchZero() { + assertExpanded("0\\.0[0]", "0"); + assertExpanded("0\\.0[0-0]", "0"); + } + + @Test + void testNegativePatternsThatOnlyMatchZero() { + assertExpanded("-0\\.0[0]", "0"); + assertExpanded("-0\\.0[0-0]", "0"); + } + + @Test + void testNegativePatternsThatCanMatchZero() { + assertExpanded("-[01234]", "-[01234]|0"); + assertExpanded("-[0-9]", "-[0-9]|0"); + assertExpanded("-\\d", "-\\d|0"); + assertExpanded("-.", "-.|0"); + assertExpanded("-.*", "-.*|0"); + assertExpanded("-.+", "-.+|0"); + assertExpanded("-0.00.*", "-0.00.*|0"); + assertExpanded("-0.00.*", "-0.00.*|0"); + assertExpanded("-0.00\\d", "-0.00\\d|0"); + assertExpanded("-00\\.0\\d.", "-00\\.0\\d.|0"); + assertExpanded("-[0-3]0\\d.", "-[0-3]0\\d.|0"); + } + + @Test + void testNegativePatternsThatCannotMatchZero() { + assertNotExpanded("-234[0-3]"); + assertNotExpanded("-.*834"); + assertNotExpanded("-0\\.00.*834"); + assertNotExpanded("-.00.001"); + } + + @Test + void testAlternations() { + assertExpanded("0\\.93|34.*|-34.*|0\\.0[0]|-0.00\\d", "0\\.93|34.*|-34.*|0|-0.00\\d|0"); + } + + public void assertNotExpanded(String pattern) { + assertExpanded(pattern, pattern); + } + + public void assertExpanded(String pattern, String expectedPattern) { + Node actual = ZeroValueNormalizer.expand(parse(pattern)); + Node expected = parse(expectedPattern); + assertThat(actual).isEqualTreeTo(expected); + } +} diff --git 
a/core/utils/type-utils/src/test/java/datawave/data/parser/WKBParserTest.java b/core/utils/type-utils/src/test/java/datawave/data/parser/WKBParserTest.java new file mode 100644 index 00000000000..1e1a60e761d --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/parser/WKBParserTest.java @@ -0,0 +1,52 @@ +package datawave.data.parser; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.io.WKBWriter; +import org.locationtech.jts.io.WKTReader; + +import com.google.common.io.BaseEncoding; + +public class WKBParserTest { + + @Test + public void testParsePoint() throws Exception { + Geometry geom = new WKTReader().read("POINT(10 20)"); + + String base64EncodedWkb = BaseEncoding.base64().encode(new WKBWriter().write(geom)); + + WKBParser wkbParser = new WKBParser(); + + Geometry parsedGeom = wkbParser.parseGeometry(base64EncodedWkb); + + assertTrue(geom.equals(parsedGeom)); + } + + @Test + public void testParseLine() throws Exception { + Geometry geom = new WKTReader().read("LINESTRING (30 10, 10 30, 40 40)"); + + String base64EncodedWkb = BaseEncoding.base64().encode(new WKBWriter().write(geom)); + + WKBParser wkbParser = new WKBParser(); + + Geometry parsedGeom = wkbParser.parseGeometry(base64EncodedWkb); + + assertTrue(geom.equals(parsedGeom)); + } + + @Test + public void testParsePolygon() throws Exception { + Geometry geom = new WKTReader().read("POLYGON((10 10, 20 20, 30 10, 10 10))"); + + String base64EncodedWkb = BaseEncoding.base64().encode(new WKBWriter().write(geom)); + + WKBParser wkbParser = new WKBParser(); + + Geometry parsedGeom = wkbParser.parseGeometry(base64EncodedWkb); + + assertTrue(geom.equals(parsedGeom)); + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/type/GeometryObjectSizeTest.java b/core/utils/type-utils/src/test/java/datawave/data/type/GeometryObjectSizeTest.java new file mode 100644 
index 00000000000..ca54018a788 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/type/GeometryObjectSizeTest.java @@ -0,0 +1,110 @@ +package datawave.data.type; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.List; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.locationtech.jts.io.WKTReader; + +import datawave.data.type.util.Geometry; +import datawave.data.type.util.Point; +import datawave.webservice.query.data.ObjectSizeOf; + +public class GeometryObjectSizeTest { + + private final static double THRESHOLD = 0.1; + + @Test + public void pointTest() throws Exception { + PointType pointType = new PointType(); + pointType.setDelegate(new Point((org.locationtech.jts.geom.Point) new WKTReader().read("POINT(0 0)"))); + + long estimatedSizeInBytes = pointType.sizeInBytes(); + long actualSizeInBytes = sizeInBytes(pointType); + + assertTrue(Math.abs(actualSizeInBytes - estimatedSizeInBytes) / (double) actualSizeInBytes <= THRESHOLD); + } + + @Test + public void multiPointTest() throws Exception { + GeometryType geometryType = new GeometryType(); + geometryType.setDelegate(new Geometry(new WKTReader().read("MULTIPOINT(0 0, 1 1, 2 2, 3 3, 4 4, 5 5, 6 6, 7 7, 8 8, 9 9, 10 10)"))); + + long estimatedSizeInBytes = geometryType.sizeInBytes(); + long actualSizeInBytes = sizeInBytes(geometryType); + + assertTrue(Math.abs(actualSizeInBytes - estimatedSizeInBytes) / (double) actualSizeInBytes <= THRESHOLD); + } + + @Test + public void polygonTest() throws Exception { + GeometryType geometryType = new GeometryType(); + geometryType.setDelegate(new Geometry( + new WKTReader().read("POLYGON((-180 -90, 180 -90, 180 90, -180 90, -180 -90), (-45 -45, 45 -45, 45 45, -45 45, -45 -45))"))); + + long estimatedSizeInBytes = geometryType.sizeInBytes(); + long actualSizeInBytes = sizeInBytes(geometryType); + + assertTrue(Math.abs(actualSizeInBytes - estimatedSizeInBytes) / (double) actualSizeInBytes 
<= THRESHOLD); + } + + @Test + public void multiPolygonTest() throws Exception { + GeometryType geometryType = new GeometryType(); + geometryType.setDelegate(new Geometry(new WKTReader().read( + "MULTIPOLYGON(((-180 -90, 180 -90, 180 90, -180 90, -180 -90), (-45 -45, 45 -45, 45 45, -45 45, -45 -45)), ((-60 -60, 60 -60, 60 60, -60 60, -60 -60)))"))); + + long estimatedSizeInBytes = geometryType.sizeInBytes(); + long actualSizeInBytes = sizeInBytes(geometryType); + + assertTrue(Math.abs(actualSizeInBytes - estimatedSizeInBytes) / (double) actualSizeInBytes <= THRESHOLD); + } + + @Test + public void lineStringTest() throws Exception { + GeometryType geometryType = new GeometryType(); + geometryType.setDelegate(new Geometry(new WKTReader().read("LINESTRING(-110 -80, -45 -76, -10 -5, 30 10, 40 50, 35 30, 170 85)"))); + + long estimatedSizeInBytes = geometryType.sizeInBytes(); + long actualSizeInBytes = sizeInBytes(geometryType); + + assertTrue(Math.abs(actualSizeInBytes - estimatedSizeInBytes) / (double) actualSizeInBytes <= THRESHOLD); + } + + @Test + public void multiLineStringTest() throws Exception { + GeometryType geometryType = new GeometryType(); + geometryType.setDelegate(new Geometry(new WKTReader().read( + "MULTILINESTRING((-110 -80, -45 -76, -10 -5, 30 10, 40 50, 35 30, 170 85), (0 1, 0 2, 0 3, 0 4, 0 5, 0 6, 0 7, 1 8, 1 9, 1 10))"))); + + long estimatedSizeInBytes = geometryType.sizeInBytes(); + long actualSizeInBytes = sizeInBytes(geometryType); + + assertTrue(Math.abs(actualSizeInBytes - estimatedSizeInBytes) / (double) actualSizeInBytes <= THRESHOLD); + } + + @Test + public void geometryCollectionTest() throws Exception { + GeometryType geometryType = new GeometryType(); + geometryType.setDelegate(new Geometry(new WKTReader().read( + "GEOMETRYCOLLECTION(POINT(0 0), MULTIPOINT(0 0, 1 1, 2 2, 3 3, 4 4, 5 5, 6 6, 7 7, 8 8, 9 9, 10 10), POLYGON((-180 -90, 180 -90, 180 90, -180 90, -180 -90), (-45 -45, 45 -45, 45 45, -45 45, -45 -45)), MULTIPOLYGON(((-180 -90, 
180 -90, 180 90, -180 90, -180 -90), (-45 -45, 45 -45, 45 45, -45 45, -45 -45)), ((-60 -60, 60 -60, 60 60, -60 60, -60 -60))), LINESTRING(-110 -80, -45 -76, -10 -5, 30 10, 40 50, 35 30, 170 85), MULTILINESTRING((-110 -80, -45 -76, -10 -5, 30 10, 40 50, 35 30, 170 85), (0 1, 0 2, 0 3, 0 4, 0 5, 0 6, 0 7, 1 8, 1 9, 1 10)), GEOMETRYCOLLECTION(POINT(0 0), MULTIPOINT(0 0, 1 1, 2 2, 3 3, 4 4, 5 5, 6 6, 7 7, 8 8, 9 9, 10 10), POLYGON((-180 -90, 180 -90, 180 90, -180 90, -180 -90), (-45 -45, 45 -45, 45 45, -45 45, -45 -45)), MULTIPOLYGON(((-180 -90, 180 -90, 180 90, -180 90, -180 -90), (-45 -45, 45 -45, 45 45, -45 45, -45 -45)), ((-60 -60, 60 -60, 60 60, -60 60, -60 -60))), LINESTRING(-110 -80, -45 -76, -10 -5, 30 10, 40 50, 35 30, 170 85), MULTILINESTRING((-110 -80, -45 -76, -10 -5, 30 10, 40 50, 35 30, 170 85), (0 1, 0 2, 0 3, 0 4, 0 5, 0 6, 0 7, 1 8, 1 9, 1 10))))"))); + + long estimatedSizeInBytes = geometryType.sizeInBytes(); + long actualSizeInBytes = sizeInBytes(geometryType); + + assertTrue(Math.abs(actualSizeInBytes - estimatedSizeInBytes) / (double) actualSizeInBytes <= THRESHOLD); + } + + private static long sizeInBytes(BaseType baseType) { + long size = 0; + if (baseType instanceof OneToManyNormalizerType) { + List values = ((OneToManyNormalizerType) baseType).getNormalizedValues(); + size += values.stream().map(String::length).map(length -> 2 * length + ObjectSizeOf.Sizer.REFERENCE).reduce(Integer::sum).orElse(0); + } + size += ObjectSizeOf.PrecomputedSizes.STRING_STATIC_REF + ObjectSizeOf.Sizer.REFERENCE + ObjectSizeOf.Sizer.REFERENCE + + (2 * baseType.getNormalizedValue().length()) + ObjectSizeOf.Sizer.getObjectSize(baseType.getDelegate()); + return size; + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/type/IpAddressTypeTest.java b/core/utils/type-utils/src/test/java/datawave/data/type/IpAddressTypeTest.java new file mode 100644 index 00000000000..9886010047b --- /dev/null +++ 
b/core/utils/type-utils/src/test/java/datawave/data/type/IpAddressTypeTest.java @@ -0,0 +1,98 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ + +package datawave.data.type; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import org.apache.log4j.Logger; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import datawave.data.type.util.IpV4Address; + +public class IpAddressTypeTest { + private static Logger log = Logger.getLogger(IpAddressTypeTest.class); + + @Test + public void testIpNormalizer01() { + String ip = "1.2.3.4"; + String expected = "001.002.003.004"; + IpAddressType norm = new IpAddressType(); + String result = norm.normalize(ip); + assertEquals(expected, result); + log.debug("result: " + result); + } + + @Test + public void testIpNormalizer02() { + String ip = "1.2.3"; + IpAddressType norm = new IpAddressType(); + assertThrows(IllegalArgumentException.class, () -> norm.normalize(ip)); + } + + @Test + public void testIpNormalizer03() { + IpAddressType norm = new IpAddressType(); + if (log.isDebugEnabled()) { + log.debug("testIpNormalizer03"); + log.debug(norm.normalize("1.2.3.*")); + log.debug(norm.normalize("1.2.3..*")); + log.debug(norm.normalize("1.2.*")); + log.debug(norm.normalize("1.2..*")); + log.debug(norm.normalize("1.*")); + log.debug(norm.normalize("1..*")); + + } + assertEquals("001.002.003.*", norm.normalize("1.2.3.*")); + assertEquals("001.002.003.*", norm.normalize("1.2.3..*")); + assertEquals("001.002.*", norm.normalize("1.2.*")); + assertEquals("001.002.*", norm.normalize("1.2..*")); + assertEquals("001.*", norm.normalize("1.*")); + assertEquals("001.*", norm.normalize("1..*")); + } + + @Test + public void testIpNormalizer04() { + log.debug("testIpNormalizer04"); + IpAddressType norm = new IpAddressType(); + log.debug(norm.normalize("*.2.13.4")); + 
log.debug(norm.normalize("*.13.4")); + assertEquals("*.002.013.004", norm.normalize("*.2.13.4")); + assertEquals("*.013.004", norm.normalize("*.13.4")); + } + + // TEST IS TURNED OFF + @Test + @Disabled + public void testIpNormalizer05() { + log.debug("testIpNormalizer05"); + IpV4Address ip = IpV4Address.parse("*.2.13.4"); + if (log.isDebugEnabled()) { + log.debug(ip.toString()); + log.debug(ip.toZeroPaddedString()); + log.debug(ip.toReverseString()); + log.debug(ip.toReverseZeroPaddedString()); + } + } + + /* + * NOTE: call toReverseString() on a wildcarded ip doesn't work right although this is not much of an issue. + */ + // TEST IS TURNED OFF + @Test + @Disabled + public void testIpNormalizer06() { + log.debug("testIpNormalizer06"); + IpV4Address ip = IpV4Address.parse("1.2.*"); + if (log.isDebugEnabled()) { + log.debug(ip.toString()); + log.debug(ip.toZeroPaddedString()); + log.debug(ip.toReverseString()); + log.debug(ip.toReverseZeroPaddedString()); + } + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/type/LcNoDiacriticsTypeTest.java b/core/utils/type-utils/src/test/java/datawave/data/type/LcNoDiacriticsTypeTest.java new file mode 100644 index 00000000000..1f47d03351c --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/type/LcNoDiacriticsTypeTest.java @@ -0,0 +1,21 @@ +package datawave.data.type; + +import static org.junit.jupiter.api.Assertions.assertNull; + +import org.junit.jupiter.api.Test; + +/** + * + * + */ +public class LcNoDiacriticsTypeTest { + @Test + public void test1() { + LcNoDiacriticsType norm = new LcNoDiacriticsType(); + String b = null; + String n1 = norm.normalize(b); + + assertNull(n1); + + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/type/ListTypeTest.java b/core/utils/type-utils/src/test/java/datawave/data/type/ListTypeTest.java new file mode 100644 index 00000000000..64bb59b7d43 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/type/ListTypeTest.java @@ 
-0,0 +1,53 @@ +package datawave.data.type; + +import java.util.Arrays; +import java.util.List; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.locationtech.jts.util.Assert; + +public class ListTypeTest { + + @Test + public void test() { + String str = "1,2,3;a;b;c"; + + LcNoDiacriticsListType t = new LcNoDiacriticsListType(str); + Assert.equals(6, t.normalizeToMany(str).size()); + List expected = Arrays.asList(new String[] {"1", "2", "3", "a", "b", "c"}); + Assert.equals(expected, t.normalizeToMany(str)); + } + + @Test + public void testLcNDList() { + String str = "01,02,03;A;B;C"; + + LcNoDiacriticsListType t = new LcNoDiacriticsListType(); + Assert.equals(6, t.normalizeToMany(str).size()); + List expected = Arrays.asList(new String[] {"01", "02", "03", "a", "b", "c"}); + Assert.equals(expected, t.normalizeToMany(str)); + } + + @Test + public void testNumberList() { + String str = "1,2,3,5.5"; + List expected = Arrays.asList(new String[] {"+aE1", "+aE2", "+aE3", "+aE5.5"}); + + NumberListType nt = new NumberListType(); + Assert.equals(4, nt.normalizeToMany(str).size()); + Assert.equals(expected, nt.normalizeToMany(str)); + } + + @Test + public void testBadNumberList() { + String str = "3,2,1,banana"; + + NumberListType nt = new NumberListType(); + Assertions.assertThrows(IllegalArgumentException.class, () -> { + nt.normalizeToMany(str); + }); + + } + +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/type/TypeFactoryTest.java b/core/utils/type-utils/src/test/java/datawave/data/type/TypeFactoryTest.java new file mode 100644 index 00000000000..9f18656ed60 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/type/TypeFactoryTest.java @@ -0,0 +1,116 @@ +package datawave.data.type; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static 
org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.util.List; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class TypeFactoryTest { + + private TypeFactory typeFactory; + + @BeforeEach + public void before() { + typeFactory = new TypeFactory(); + } + + @Test + public void testWithCorrectType() { + Type type = Type.Factory.createType("datawave.data.type.LcType"); + assertInstanceOf(LcType.class, type); + } + + @Test + public void testWithIncorrectType() { + assertThrows(IllegalArgumentException.class, () -> Type.Factory.createType("datawave.ingest.data.normalizer.LcNoDiacriticsNormalizer")); + } + + @Test + public void testTypeFactoryWithCache() { + TypeFactory factory = new TypeFactory(); + + Type typeOne = factory.createType(LcType.class.getName()); + Type typeTwo = factory.createType(LcType.class.getName()); + + assertSame(typeOne, typeTwo); + } + + @Test + public void testTypeFactoryCustomSize() { + TypeFactory factory = new TypeFactory(1, 15); + + Type typeOne = factory.createType(LcType.class.getName()); + Type typeTwo = factory.createType(IpAddressType.class.getName()); + Type typeThree = factory.createType(IpAddressType.class.getName()); + Type typeFour = factory.createType(LcType.class.getName()); + + // same type created in a row with a cache size of one will return the same type instance + assertSame(typeTwo, typeThree); + + // same type created with other types between will return different instances + assertNotSame(typeOne, typeFour); + + assertEquals(1, factory.getCacheSize()); + } + + @Test + public void testAllTypesAllFactories() { + // AbstractGeometryType, BaseType and ListType are technically all abstract types and cannot be created + + // @formatter:off + List typeClassNames = List.of(DateType.class.getName(), + GeoLatType.class.getName(), + GeoLonType.class.getName(), + GeometryType.class.getName(), + GeoType.class.getName(), + 
HexStringType.class.getName(), + HitTermType.class.getName(), + IpAddressType.class.getName(), + IpV4AddressType.class.getName(), + LcNoDiacriticsListType.class.getName(), + LcNoDiacriticsType.class.getName(), + LcType.class.getName(), + MacAddressType.class.getName(), + NoOpType.class.getName(), + NumberListType.class.getName(), + NumberType.class.getName(), + PointType.class.getName(), + RawDateType.class.getName(), + StringType.class.getName(), + TrimLeadingZerosType.class.getName()); + // @formatter:on + + for (String typeClassName : typeClassNames) { + assertTypeCreation(typeClassName); + } + + assertEquals(20, typeFactory.getCacheSize()); + } + + /** + * Assert that the same Type is created via the internal {@link Type.Factory} and the {@link TypeFactory}. + *

+ * Also asserts that multiple calls to {@link TypeFactory#createType(String)} return the same instance. + * + * @param typeClassName + * the class name for a Type + */ + private void assertTypeCreation(String typeClassName) { + Type internalCreate = Type.Factory.createType(typeClassName); + + Type factoryCreateOne = typeFactory.createType(typeClassName); + Type factoryCreateTwo = typeFactory.createType(typeClassName); + + assertSame(factoryCreateOne, factoryCreateTwo, "TypeFactory should have returned the same instance"); + + assertNotSame(internalCreate, factoryCreateOne, "Type.Factory and TypeFactory should have returned different instances"); + assertNotSame(internalCreate, factoryCreateTwo, "Type.Factory and TypeFactory should have returned different instances"); + } + +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/type/util/IpV6AddressTypeTest.java b/core/utils/type-utils/src/test/java/datawave/data/type/util/IpV6AddressTypeTest.java new file mode 100644 index 00000000000..ad9dee6397f --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/type/util/IpV6AddressTypeTest.java @@ -0,0 +1,56 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ + +package datawave.data.type.util; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.apache.log4j.Logger; +import org.junit.jupiter.api.Test; + +/** + * + */ +public class IpV6AddressTypeTest { + private static Logger log = Logger.getLogger(IpV6AddressTypeTest.class); + + private final String[] in = { // + "2001:0db8:0000:0000:0000:ff00:0042:8329", // + "2003:DEAD:BEEF:4DAD:23:46:bb:101", // + "2000:FFFF:EEEE:DD:CC:0000:0000:0000", // + "AAAA:BBBB:CCCC:DDDD:EEEE:FFFF:2222:0", // + "ff02:0b00:0000:0000:0001:0000:0000:000a", // + "0000:0000:0000:0000:0000:0000:0000:0001", // + "0000:0000:0000:0000:0000:0000:0000:0000", // + "0001:0000:0000:0000:0000:0000:0000:0000", // + }; + private final String[] out = { // + "2001:db8::ff00:42:8329", // + "2003:dead:beef:4dad:23:46:bb:101", // + "2000:ffff:eeee:dd:cc:", // + "aaaa:bbbb:cccc:dddd:eeee:ffff:2222:", // + "ff02:b00::0001:0:0:a", // + "::1", // + "::", // + "1::" + + }; + + /** + * Take a valid IpV6Address string, parse it to an IpV6Address instance, take the toString value from the IpV6Address and parse that into another + * IpV6Address instance. Make sure that the toString for the original and reparsed addresses match. 
+ */ + @Test + public void testIpNormalizer01() { + + for (String address : in) { + IpV6Address addr = IpV6Address.parse(address); + log.debug(address + " parsed to: " + addr); + IpV6Address reparsed = IpV6Address.parse(addr.toString()); + log.debug(address + " parsed to: " + addr + " and re-parsed to " + reparsed); + assertEquals(reparsed.toString(), addr.toString(), "Had a problem re-parsing " + address); + } + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/data/type/util/NumericalEncoderTest.java b/core/utils/type-utils/src/test/java/datawave/data/type/util/NumericalEncoderTest.java new file mode 100644 index 00000000000..b98fc3589cb --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/data/type/util/NumericalEncoderTest.java @@ -0,0 +1,87 @@ +package datawave.data.type.util; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Random; + +import org.junit.jupiter.api.Test; + +public class NumericalEncoderTest { + + @Test + public void testIsPossiblyEncoded() { + assertFalse(NumericalEncoder.isPossiblyEncoded(null)); + assertFalse(NumericalEncoder.isPossiblyEncoded("")); + assertFalse(NumericalEncoder.isPossiblyEncoded("1")); + assertFalse(NumericalEncoder.isPossiblyEncoded("+1")); + assertFalse(NumericalEncoder.isPossiblyEncoded("!1")); + assertTrue(NumericalEncoder.isPossiblyEncoded("+aE5.4")); + assertTrue(NumericalEncoder.isPossiblyEncoded("+ae5.4")); + assertFalse(NumericalEncoder.isPossiblyEncoded("+aE5.4.4.4.4")); + assertTrue(NumericalEncoder.isPossiblyEncoded("+AE0")); + assertFalse(NumericalEncoder.isPossiblyEncoded("+AE0..")); + assertFalse(NumericalEncoder.isPossiblyEncoded(Long.valueOf(Long.MAX_VALUE).toString())); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode(".0005"))); + 
assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("1"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("5"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("1000"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("1001"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("10001"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("100001"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("1000001"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("100000001"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("100000008"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("-.0005"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("-1"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("-5"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("-1000"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("-1001"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("-10001"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("-100001"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("-1000001"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("-100000001"))); + assertTrue(NumericalEncoder.isPossiblyEncoded(NumericalEncoder.encode("-100000008"))); + + } + + @Test + public void testEncode() { + assertEquals("+aE5", NumericalEncoder.encode("5")); + assertEquals("+aE6", NumericalEncoder.encode("6")); + assertEquals("+dE1", NumericalEncoder.encode("1000")); + assertEquals("+dE1.001", NumericalEncoder.encode("1001")); + assertEquals("+eE1.0001", NumericalEncoder.encode("10001")); + assertEquals("+fE1.00001", 
NumericalEncoder.encode("100001")); + assertEquals("+gE1.000001", NumericalEncoder.encode("1000001")); + assertEquals("+iE1.00000001", NumericalEncoder.encode("100000001")); + assertEquals("+iE1.00000008", NumericalEncoder.encode("100000008")); + } + + @Test + public void testDecode() { + for (long i = 0; i < 10000; i++) { + assertEquals(i, NumericalEncoder.decode(NumericalEncoder.encode(Long.valueOf(i).toString())).longValue()); + } + + } + + @Test + public void testDecodeBigNums() { + for (long i = 5; i < Long.MAX_VALUE; i *= 1.0002) { + assertEquals(i, NumericalEncoder.decode(NumericalEncoder.encode(Long.valueOf(i).toString())).longValue()); + i++; + } + } + + @Test + public void testDecodeBigNumsRandomIncrement() { + int increment = new Random().nextInt(9) + 1; + for (long i = 1; i < Long.MAX_VALUE; i *= 1.0002) { + assertEquals(i, NumericalEncoder.decode(NumericalEncoder.encode(Long.valueOf(i).toString())).longValue()); + i += increment; + } + + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/query/parser/JavaRegexAnalyzerTest.java b/core/utils/type-utils/src/test/java/datawave/query/parser/JavaRegexAnalyzerTest.java new file mode 100644 index 00000000000..bb563a4f435 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/query/parser/JavaRegexAnalyzerTest.java @@ -0,0 +1,1169 @@ +package datawave.query.parser; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; 
+import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import com.google.common.base.Stopwatch; +import com.google.common.collect.Lists; + +import datawave.query.parser.JavaRegexAnalyzer.JavaRegexParseException; + +/** + * + * + */ +public class JavaRegexAnalyzerTest { + + private static final Logger log = Logger.getLogger(JavaRegexAnalyzerTest.class); + + @BeforeAll + public static void setUpClass() { + Logger.getRootLogger().setLevel(Level.OFF); + } + + @AfterAll + public static void tearDownClass() {} + + @BeforeEach + public void setUp() { + log.setLevel(Level.OFF); + Logger.getLogger(JavaRegexAnalyzer.class).setLevel(Level.OFF); + } + + @AfterEach + public void tearDown() {} + + public void enableLogging() { + log.setLevel(Level.DEBUG); + Logger.getLogger(JavaRegexAnalyzer.class).setLevel(Level.TRACE); + } + + @Test + public void testRegexAnalyzer01() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer01"); + String value = "abc.xyz"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("abc", wcd.getLeadingOrTrailingLiteral()); + assertEquals("abc", wcd.getLeadingLiteral()); + assertEquals("xyz", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer02() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer02"); + String value = "abc\\.xyz"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + 
log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + assertFalse(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("abc.xyz", wcd.getLeadingOrTrailingLiteral()); + assertEquals("abc.xyz", wcd.getLeadingLiteral()); + assertEquals("abc.xyz", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer03() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer03"); + String value = "abcxy.*z"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("abcxy", wcd.getLeadingOrTrailingLiteral()); + assertEquals("abcxy", wcd.getLeadingLiteral()); + assertEquals("z", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer04() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer04"); + String value = "abc\\.\\*xyz"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + assertFalse(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("abc.*xyz", wcd.getLeadingOrTrailingLiteral()); + assertEquals("abc.*xyz", wcd.getLeadingLiteral()); + assertEquals("abc.*xyz", wcd.getTrailingLiteral()); + } + + @Test + public 
void testRegexAnalyzer05() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer05"); + String value = "abcxy.*?"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("abcxy", wcd.getLeadingOrTrailingLiteral()); + assertEquals("abcxy", wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer06() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer06"); + String value = "abcxyz\\.\\*\\?"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + assertFalse(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("abcxyz.*?", wcd.getLeadingOrTrailingLiteral()); + assertEquals("abcxyz.*?", wcd.getLeadingLiteral()); + assertEquals("abcxyz.*?", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer07() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer07"); + String value = "abc.xyz.*?"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + 
assertTrue(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("abc", wcd.getLeadingOrTrailingLiteral()); + assertEquals("abc", wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer08() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer08"); + String value = "abc.*xyz.*?"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("abc", wcd.getLeadingOrTrailingLiteral()); + assertEquals("abc", wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer09() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer09"); + String value = "abc\\.\\*xyz.*?"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("abc.*xyz", wcd.getLeadingOrTrailingLiteral()); + assertEquals("abc.*xyz", wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer10() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer10"); + String value = ".*something\\.com"; + 
JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertTrue(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertFalse(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("something.com", wcd.getLeadingOrTrailingLiteral()); + assertNull(wcd.getLeadingLiteral()); + assertEquals("something.com", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer11() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer11"); + String value = "something.com"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("something", wcd.getLeadingOrTrailingLiteral()); + assertEquals("something", wcd.getLeadingLiteral()); + assertEquals("com", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer12() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer12"); + String value = "something\\.com"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + 
wcd.getLeadingOrTrailingLiteral()); + assertFalse(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("something.com", wcd.getLeadingOrTrailingLiteral()); + assertEquals("something.com", wcd.getLeadingLiteral()); + assertEquals("something.com", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer13() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer13"); + String value = ".*dude.*"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertTrue(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertFalse(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertTrue(wcd.isNgram()); + assertNull(wcd.getLeadingOrTrailingLiteral()); + assertNull(wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer14() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer14"); + String value = ".*dude.*com"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertTrue(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertFalse(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("com", wcd.getLeadingOrTrailingLiteral()); + 
assertNull(wcd.getLeadingLiteral()); + assertEquals("com", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer15() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer15"); + String value = ".*dude.*com\\."; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertTrue(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertFalse(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("com.", wcd.getLeadingOrTrailingLiteral()); + assertNull(wcd.getLeadingLiteral()); + assertEquals("com.", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer16() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer16"); + String value = "128\\.0\\.1\\.16"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertFalse(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("128.0.1.16", wcd.getLeadingOrTrailingLiteral()); + assertEquals("128.0.1.16", wcd.getLeadingLiteral()); + assertEquals("128.0.1.16", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer17() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer17"); + String value = "128\\.0\\.1\\..*"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + 
assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("128.0.1.", wcd.getLeadingOrTrailingLiteral()); + assertEquals("128.0.1.", wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer17_1() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer17"); + String value = "128\\.0\\.1\\..*?"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("128.0.1.", wcd.getLeadingOrTrailingLiteral()); + assertEquals("128.0.1.", wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer19() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer19"); + String value = "\\[I=2077c64e4eb655.*"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + 
assertFalse(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("[I=2077c64e4eb655", wcd.getLeadingOrTrailingLiteral()); + assertEquals("[I=2077c64e4eb655", wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer20() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer20"); + String value = "\\\\\\\\some\\\\\\\\file\\\\\\\\path.*"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("\\\\some\\\\file\\\\path", wcd.getLeadingOrTrailingLiteral()); + assertEquals("\\\\some\\\\file\\\\path", wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer21() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer21"); + String value = "bla?"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("bl", wcd.getLeadingOrTrailingLiteral()); + assertEquals("bl", 
wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer22() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer22"); + String value = "bla*"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("bl", wcd.getLeadingOrTrailingLiteral()); + assertEquals("bl", wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer23() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer23"); + String value = "bla+"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("bla", wcd.getLeadingOrTrailingLiteral()); + assertEquals("bla", wcd.getLeadingLiteral()); + assertEquals("a", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer24() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer24"); + String value = "bla{2}bla*"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + 
log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("bla", wcd.getLeadingOrTrailingLiteral()); + assertEquals("bla", wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer25() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer25"); + String value = "bla{0,3}"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("bl", wcd.getLeadingOrTrailingLiteral()); + assertEquals("bl", wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer26() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer26"); + String value = "bla{0}bla+"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + 
assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("bl", wcd.getLeadingOrTrailingLiteral()); + assertEquals("bl", wcd.getLeadingLiteral()); + assertEquals("a", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer27() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer27"); + String value = "(bla)+"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("bla", wcd.getLeadingOrTrailingLiteral()); + assertEquals("bla", wcd.getLeadingLiteral()); + assertEquals("bla", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer28() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer28"); + String value = "(bla)*"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertTrue(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertFalse(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertTrue(wcd.isNgram()); + assertNull(wcd.getLeadingOrTrailingLiteral()); + assertNull(wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer29() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer29"); + String value = 
"((foo)+(bar)+)+"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("foo", wcd.getLeadingOrTrailingLiteral()); + assertEquals("foo", wcd.getLeadingLiteral()); + assertEquals("bar", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer30() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer30"); + String value = "((foo)+(bar*)+)+"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("foo", wcd.getLeadingOrTrailingLiteral()); + assertEquals("foo", wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer31() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer31"); + String value = "((bar*)+)+"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + 
assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("ba", wcd.getLeadingOrTrailingLiteral()); + assertEquals("ba", wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer32() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer32"); + String value = "((bar*)+)*"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertTrue(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertFalse(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertTrue(wcd.isNgram()); + assertNull(wcd.getLeadingOrTrailingLiteral()); + assertNull(wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer33() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer33"); + String value = "foo|bar"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertTrue(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertFalse(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertTrue(wcd.isNgram()); + assertNull(wcd.getLeadingOrTrailingLiteral()); + assertNull(wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void 
testRegexAnalyzer34() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer34"); + String value = "(foo|bar)bar"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertTrue(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertFalse(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("bar", wcd.getLeadingOrTrailingLiteral()); + assertNull(wcd.getLeadingLiteral()); + assertEquals("bar", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer35() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer35"); + String value = "foo(foo|bar)bar"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("foo", wcd.getLeadingOrTrailingLiteral()); + assertEquals("foo", wcd.getLeadingLiteral()); + assertEquals("bar", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer36() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer36"); + String value = "foo(foo)|(bar)bar"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + 
wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertTrue(wcd.isLeadingRegex()); + assertTrue(wcd.isTrailingRegex()); + assertFalse(wcd.isLeadingLiteral()); + assertFalse(wcd.isTrailingLiteral()); + assertTrue(wcd.isNgram()); + assertNull(wcd.getLeadingOrTrailingLiteral()); + assertNull(wcd.getLeadingLiteral()); + assertNull(wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer37() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer37"); + String value = "foo.(?\\$)bar"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertFalse(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("foobar", wcd.getLeadingOrTrailingLiteral()); + assertEquals("foobar", wcd.getLeadingLiteral()); + assertEquals("foobar", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer39() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer39"); + String value = "(foo(x)?bar)"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("foo", 
wcd.getLeadingOrTrailingLiteral()); + assertEquals("foo", wcd.getLeadingLiteral()); + assertEquals("bar", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer40() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer40"); + String value = "foo{1,}"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + log.debug("wcd.hasWildCard(): " + wcd.hasWildCard()); + log.debug("wcd.getLeadingOrTrailingLiteral(): " + wcd.getLeadingOrTrailingLiteral()); + assertTrue(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("foo", wcd.getLeadingOrTrailingLiteral()); + assertEquals("foo", wcd.getLeadingLiteral()); + assertEquals("o", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer41() throws JavaRegexParseException { + log.debug("---testRegexAnalyzer41"); + String value = "(?-icu)Friendly"; + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + assertFalse(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals("Friendly", wcd.getLeadingOrTrailingLiteral()); + assertEquals("Friendly", wcd.getLeadingLiteral()); + assertEquals("Friendly", wcd.getTrailingLiteral()); + } + + @Test + public void testRegexAnalyzer42() { + log.debug("---testRegexAnalyzer42"); + String value = "(?#icu)Friendly"; + + assertThrows(JavaRegexParseException.class, () -> new JavaRegexAnalyzer(value)); + } + + @Test + public void testRegexAnalyzerQuoting() throws JavaRegexParseException { + 
log.debug("---testRegexAnalyzerQuoting"); + Map values = new HashMap<>(); + values.put("\\Q+ae4\\E", "+ae4"); + values.put("abc\\Q+ae4\\E", "abc+ae4"); + values.put("\\Q+ae4\\Edef", "+ae4def"); + values.put("abc\\Q+ae4\\Edef", "abc+ae4def"); + for (String value : values.keySet()) { + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + assertFalse(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals(values.get(value), wcd.getLeadingOrTrailingLiteral()); + assertEquals(values.get(value), wcd.getLeadingLiteral()); + assertEquals(values.get(value), wcd.getTrailingLiteral()); + } + } + + @Test + public void testRegexAnalyzerBoundary() throws JavaRegexParseException { + log.debug("---testRegexAnalyzerQuoting"); + Map values = new HashMap<>(); + values.put("\\Bae4\\b", "ae4"); + values.put("\\Zae4\\z", "ae4"); + values.put("abc\\Gae4", "abcae4"); + values.put("\\Bae4\\Zdef", "ae4def"); + values.put("abc\\Aae4\\Gdef", "abcae4def"); + values.put("^abc\\Aae4\\Gdef$", "abcae4def"); + for (String value : values.keySet()) { + JavaRegexAnalyzer wcd = new JavaRegexAnalyzer(value); + assertEquals(value, wcd.getRegex()); + assertEquals(value, wcd.toString()); + log.debug("value: " + value); + assertFalse(wcd.hasWildCard()); + assertFalse(wcd.isLeadingRegex()); + assertFalse(wcd.isTrailingRegex()); + assertTrue(wcd.isLeadingLiteral()); + assertTrue(wcd.isTrailingLiteral()); + assertFalse(wcd.isNgram()); + assertEquals(values.get(value), wcd.getLeadingOrTrailingLiteral()); + assertEquals(values.get(value), wcd.getLeadingLiteral()); + assertEquals(values.get(value), wcd.getTrailingLiteral()); + } + } + + @Test + public void testZeroPadIpRegex() throws JavaRegexParseException { + 
log.debug("---testRegexAnalyzer18"); + + assertEquals("001\\.002\\.003\\.004", new JavaRegexAnalyzer("1\\.2\\.3\\.4").getZeroPadIpRegex()); + assertEquals("001\\.002\\.003\\.0{0,3}.*", new JavaRegexAnalyzer("1\\.2\\.3\\..*").getZeroPadIpRegex()); + assertEquals("001\\.002\\.0{0,3}.*", new JavaRegexAnalyzer("1\\.2\\..*").getZeroPadIpRegex()); + assertEquals("001\\.0{0,3}.*", new JavaRegexAnalyzer("1\\..*").getZeroPadIpRegex()); + + assertEquals("001\\.122\\.013\\.004", new JavaRegexAnalyzer("1\\.122\\.13\\.4").getZeroPadIpRegex()); + assertEquals("001\\.122\\.013\\.0{0,3}.*", new JavaRegexAnalyzer("1\\.122\\.13\\..*").getZeroPadIpRegex()); + assertEquals("091\\.122\\.0{0,3}.*", new JavaRegexAnalyzer("91\\.122\\..*").getZeroPadIpRegex()); + assertEquals("012\\.0{0,3}.*", new JavaRegexAnalyzer("12\\..*").getZeroPadIpRegex()); + + assertEquals("001\\.122\\.013\\.0{0,3}.*?", new JavaRegexAnalyzer("1\\.122\\.13\\..*?").getZeroPadIpRegex()); + assertEquals("091\\.122\\.0{0,3}.*+", new JavaRegexAnalyzer("91\\.122\\..*+").getZeroPadIpRegex()); + assertEquals("012\\.0{0,3}.*?", new JavaRegexAnalyzer("12\\..*?").getZeroPadIpRegex()); + + try { + assertEquals("00a\\.00b\\.00c\\.00d", new JavaRegexAnalyzer("a\\.b\\.c\\.d").getZeroPadIpRegex()); + fail("Expected letters to be invalid in an IP regex"); + } catch (JavaRegexParseException e) { + // expected + } + + assertEquals("001\\.027\\.0{0,3}.*\\.012", new JavaRegexAnalyzer("1\\.27\\..*\\.12").getZeroPadIpRegex()); + + assertEquals("078\\.038\\.218\\.0{0,3}.*?", new JavaRegexAnalyzer("78\\.38\\.218\\..*?").getZeroPadIpRegex()); + assertEquals("078\\.038\\.218\\....", new JavaRegexAnalyzer("78\\.38\\.218\\....").getZeroPadIpRegex()); + assertEquals("078\\.038\\.218\\.\\d\\d\\d", new JavaRegexAnalyzer("78\\.38\\.218\\.\\d\\d\\d").getZeroPadIpRegex()); + + assertEquals("0{0,3}8{0,3}\\.038\\.218\\.\\d\\d\\d", new JavaRegexAnalyzer("8{0,3}\\.38\\.218\\.\\d\\d\\d").getZeroPadIpRegex()); + 
assertEquals("0{1,3}8{0,2}\\.038\\.218\\.\\d\\d\\d", new JavaRegexAnalyzer("8{0,2}\\.38\\.218\\.\\d\\d\\d").getZeroPadIpRegex()); + assertEquals("0{0,2}08{0,2}\\.038\\.218\\.\\d\\d\\d", new JavaRegexAnalyzer("08{0,2}\\.38\\.218\\.\\d\\d\\d").getZeroPadIpRegex()); + + assertEquals("0.3\\.02.\\.3.3\\...3", new JavaRegexAnalyzer(".3\\.2.\\.3.3\\...3").getZeroPadIpRegex()); + + assertEquals("00\\x34\\.00\\ua425\\.00\\06\\.00\\p{Digit}", new JavaRegexAnalyzer("\\x34\\.\\ua425\\.\\06\\.\\p{Digit}").getZeroPadIpRegex()); + assertEquals("00\\0127\\.00\\063\\.00\\06\\.00\\P{Alpha}", new JavaRegexAnalyzer("\\0127\\.\\063\\.\\06\\.\\P{Alpha}").getZeroPadIpRegex()); + + assertEquals("234\\.234\\.234\\.00[\\p{L}&&[^\\p{Lu}]]", new JavaRegexAnalyzer("234\\.234\\.234\\.[\\p{L}&&[^\\p{Lu}]]").getZeroPadIpRegex()); + + assertEquals("0(3|4)2\\.0{0,1}(24|123)\\.0[234]4\\.123", new JavaRegexAnalyzer("(3|4)2\\.(24|123)\\.[234]4\\.123").getZeroPadIpRegex()); + + assertEquals("012\\.012\\.012\\.012|023\\.023\\.023\\.023", new JavaRegexAnalyzer("12\\.12\\.12\\.12|23\\.23\\.23\\.23").getZeroPadIpRegex()); + assertEquals("(012\\.012\\.012\\.012|023\\.023\\.023\\.023)", new JavaRegexAnalyzer("(12\\.12\\.12\\.12|23\\.23\\.23\\.23)").getZeroPadIpRegex()); + assertEquals("012\\.(012\\.012|023\\.023)\\.012", new JavaRegexAnalyzer("12\\.(12\\.12|23\\.23)\\.12").getZeroPadIpRegex()); + + assertEquals("012\\.(012\\.012|023\\.023)\\.0(1|2)2", new JavaRegexAnalyzer("12\\.(12\\.12|23\\.23)\\.(1|2)2").getZeroPadIpRegex()); + + // These required redistributing parts of tuples across a set of nested alternatives which contain a separator + // The best way to handle this is to redistribute the external parts within the grouped alternatives. + // This will break things if back references are included. 
Decided this is not required and should instead fail normalization + try { + // if the assert fails, then we probably attempted to enable this distribution thing or the detection and subsequent throw JavaRegexParseException + // failed + assertEquals("012\\.(012\\.03(3|5)|123\\.23(3|5))\\.012", new JavaRegexAnalyzer("12\\.1(2\\.3|23\\.23)(3|5)\\.12").getZeroPadIpRegex()); + } catch (JavaRegexParseException jrpe) { + // expected + } + try { + assertEquals("012\\.(012\\.03(3|5)|123\\.23(3|5))\\.012", new JavaRegexAnalyzer("12\\.1(2(\\.3|3\\.23))(3|5)\\.12").getZeroPadIpRegex()); + } catch (JavaRegexParseException jrpe) { + // expected + } + try { + assertEquals("012\\.(012\\.03(3|5)|123\\.23(3|5))\\.012", new JavaRegexAnalyzer("12\\.1(2(\\.|3\\.2)3)(3|5)\\.12").getZeroPadIpRegex()); + } catch (JavaRegexParseException jrpe) { + // expected + } + } + + // This will take at least 5 minutes to enumerate, not really something we want to run as unit test + @Test + @Disabled + public void extensiveIpAddressRegexs() throws JavaRegexParseException { + Stopwatch sw = Stopwatch.createUnstarted(); + sw.start(); + for (int i = 1; i < 256; i++) { + for (int j = 1; j < 256; j++) { + for (int k = 1; k < 256; k++) { + String expected = String.format("%03d\\.%03d\\.%03d\\.%s", i, j, k, ".*?"); + String origIp = String.format("%d\\.%d\\.%d\\.%s", i, j, k, ".*?"); + String paddedIp = new JavaRegexAnalyzer(origIp).getZeroPadIpRegex(); + assertEquals(expected, paddedIp); + } + } + } + sw.stop(); + } + + @Test + public void testDigitRegexs() throws JavaRegexParseException { + // Try to generate a list of potentially edge-case octet values (from the digit regex in the JavaRegexAnalyzer#zeroPadIpRegex(String) method + List octetsToEnumerate = Lists.newArrayList(1, 10, 20, 70, 100, 101, 120, 170, 200, 201, 220, 255); + + // Then enumerate all combinations of them to make sure they all generate zero-padded 1 through 3 octets with the trailing wildcard + for (Integer i : octetsToEnumerate) { + 
for (Integer j : octetsToEnumerate) { + for (Integer k : octetsToEnumerate) { + String expected = String.format("%03d\\.%03d\\.%03d\\.%s", i, j, k, "0{0,3}.*?"); + String origIp = String.format("%d\\.%d\\.%d\\.%s", i, j, k, ".*?"); + String paddedIp = new JavaRegexAnalyzer(origIp).getZeroPadIpRegex(); + assertEquals(expected, paddedIp); + } + } + } + } + + @Test + public void testRegexLowerCase() throws JavaRegexParseException { + Map testPatterns = new HashMap<>(); + testPatterns.put("No Wildcards", "no wildcards"); + testPatterns.put(".*No.*Escapes.*", ".*no.*escapes.*"); + testPatterns.put("\\\\Test \\\\Escapes\\\\\\D\\\\", "\\\\test \\\\escapes\\\\\\D\\\\"); + testPatterns.put("Test\\t\\nCharacter\\r\\f\\a\\e\\d\\D\\s\\S\\w\\W\\p{Print}\\p{XDigit}Classes", + "test\\t\\ncharacter\\r\\f\\a\\e\\d\\D\\s\\S\\w\\W\\p{Print}\\p{XDigit}classes"); + testPatterns.put("\\uFFFFTest\\x00\\x01\\05\\024\\0267Octal and Hex Character\\uFFFF\\uFE01\\xFE\\x0EClasses", + "\\uFFFFtest\\x00\\x01\\05\\024\\0267octal and hex character\\uFFFF\\uFE01\\xFE\\x0Eclasses"); + testPatterns.put("\\uFFFFTest\\Q\\uFFFF\\\\Quoted\\E\\uFFFFRegex", "\\uFFFFtest\\Q\\uffff\\\\quoted\\E\\uFFFFregex"); + testPatterns.put( + "\\p{Lower}\\p{Upper}[^\\p{Lower}][^\\p{Upper}]Test Upper And Lower Character Classes\\p{javaLowerCase}\\p{javaUpperCase}[^\\p{javaLowerCase}][^\\p{javaUpperCase}]", + "\\p{Lower}\\p{Lower}[\\p{Lower}][^\\p{Upper}]test upper and lower character classes\\p{javaLowerCase}\\p{javaLowerCase}[\\p{javaLowerCase}][^\\p{javaUpperCase}]"); + testPatterns.put( + "\\P{Lower}\\P{Upper}[^\\P{Lower}][^\\P{Upper}]Test Upper And Lower Negated Character Classes\\P{javaLowerCase}\\P{javaUpperCase}[^\\P{javaLowerCase}][^\\P{javaUpperCase}]", + "\\p{Lower}\\P{Upper}[^\\P{Lower}][\\P{Upper}]test upper and lower negated character classes\\p{javaLowerCase}\\P{javaUpperCase}[^\\P{javaLowerCase}][\\P{javaUpperCase}]"); + testPatterns.put("\\p{Lu}[^\\p{Lu}]Test Upper And Lower Character 
Classes[\\p{L}&&[^\\p{Lu}]]", + "\\p{L}[^\\p{Lu}]test upper and lower character classes[\\p{L}&&[^\\p{Lu}]]"); + testPatterns.put("\\P{Lu}[^\\P{Lu}]Test Upper And Lower Character Classes[\\p{L}&&[^\\P{Lu}]]", + "\\P{Lu}[\\P{Lu}]test upper and lower character classes[\\p{L}&&[\\P{Lu}]]"); + + for (Map.Entry testPattern : testPatterns.entrySet()) { + JavaRegexAnalyzer analyzer = new JavaRegexAnalyzer(testPattern.getKey()); + assertEquals(testPattern.getKey(), analyzer.getRegex()); + assertEquals(testPattern.getKey(), analyzer.toString()); + analyzer.applyRegexCaseSensitivity(false); + assertEquals(testPattern.getValue(), analyzer.getRegex()); + } + } + + @Test + public void testRegexUpperCase() throws JavaRegexParseException { + Map testPatterns = new HashMap<>(); + testPatterns.put("No Wildcards", "NO WILDCARDS"); + testPatterns.put(".*No.*Escapes.*", ".*NO.*ESCAPES.*"); + testPatterns.put("\\\\Test \\\\Escapes\\\\\\D\\\\", "\\\\TEST \\\\ESCAPES\\\\\\D\\\\"); + testPatterns.put("Test\\t\\nCharacter\\r\\f\\a\\e\\d\\D\\s\\S\\w\\W\\p{Print}\\p{XDigit}Classes", + "TEST\\t\\nCHARACTER\\r\\f\\a\\e\\d\\D\\s\\S\\w\\W\\p{Print}\\p{XDigit}CLASSES"); + testPatterns.put("\\uFFFFTest\\x00\\x01\\05\\024\\0267Octal and Hex Character\\uFFFF\\uFE01\\xFE\\x0EClasses", + "\\uFFFFTEST\\x00\\x01\\05\\024\\0267OCTAL AND HEX CHARACTER\\uFFFF\\uFE01\\xFE\\x0ECLASSES"); + testPatterns.put("\\uFFFFTest\\Q\\uFFFF\\\\Quoted\\E\\uFFFFRegex", "\\uFFFFTEST\\Q\\UFFFF\\\\QUOTED\\E\\uFFFFREGEX"); + testPatterns.put( + "\\p{Lower}\\p{Upper}[^\\p{Lower}][^\\p{Upper}]Test Upper And Lower Character Classes\\p{javaLowerCase}\\p{javaUpperCase}[^\\p{javaLowerCase}][^\\p{javaUpperCase}]", + "\\p{Upper}\\p{Upper}[^\\p{Lower}][\\p{Upper}]TEST UPPER AND LOWER CHARACTER CLASSES\\p{javaUpperCase}\\p{javaUpperCase}[^\\p{javaLowerCase}][\\p{javaUpperCase}]"); + testPatterns.put( + "\\P{Lower}\\P{Upper}[^\\P{Lower}][^\\P{Upper}]Test Upper And Lower Negated Character 
Classes\\P{javaLowerCase}\\P{javaUpperCase}[^\\P{javaLowerCase}][^\\P{javaUpperCase}]", + "\\P{Lower}\\p{Upper}[\\P{Lower}][^\\P{Upper}]TEST UPPER AND LOWER NEGATED CHARACTER CLASSES\\P{javaLowerCase}\\p{javaUpperCase}[\\P{javaLowerCase}][^\\P{javaUpperCase}]"); + testPatterns.put("\\p{Lu}[^\\p{Lu}]Test Upper And Lower Character Classes[\\p{L}&&[^\\p{Lu}]]", + "\\p{Lu}[\\p{Lu}]TEST UPPER AND LOWER CHARACTER CLASSES[\\p{L}&&[\\p{Lu}]]"); + testPatterns.put("\\P{Lu}[^\\P{Lu}]Test Upper And Lower Character Classes[\\p{L}&&[^\\P{Lu}]]", + "\\p{L}[^\\P{Lu}]TEST UPPER AND LOWER CHARACTER CLASSES[\\p{L}&&[^\\P{Lu}]]"); + + for (Map.Entry testPattern : testPatterns.entrySet()) { + JavaRegexAnalyzer analyzer = new JavaRegexAnalyzer(testPattern.getKey()); + assertEquals(testPattern.getKey(), analyzer.getRegex()); + assertEquals(testPattern.getKey(), analyzer.toString()); + analyzer.applyRegexCaseSensitivity(true); + assertEquals(testPattern.getValue(), analyzer.getRegex()); + } + } + + @Test + public void testRegexCaseFailures() { + Set testPatterns = new HashSet<>(); + testPatterns.add("Test\\p{Missing Curly Bracket"); + testPatterns.add("Test\\u\\wMissing Hex Digit"); + testPatterns.add("Test\\uF\\wMissing Hex Digit"); + testPatterns.add("Test\\uFF\\wMissing Hex Digit"); + testPatterns.add("Test\\uFFF\\wMissing Hex Digit"); + testPatterns.add("Test\\uFFFG\\wMissing Hex Digit"); + testPatterns.add("Test\\uGFFF\\wMissing Hex Digit"); + testPatterns.add("Test\\x\\wMissing Hex Digit"); + testPatterns.add("Test\\xF\\wMissing Hex Digit"); + testPatterns.add("Test\\xFG\\wMissing Hex Digit"); + testPatterns.add("Test\\xGF\\wMissing Hex Digit"); + testPatterns.add("Test\\c\\Missing Control Char"); + + for (String testPattern : testPatterns) { + try { + new JavaRegexAnalyzer(testPattern); + fail("Expected failure processing " + testPattern); + } catch (JavaRegexParseException e) { + // expected + } + } + } + +} diff --git 
a/core/utils/type-utils/src/test/java/datawave/webservice/query/data/ObjectSizeOfTest.java b/core/utils/type-utils/src/test/java/datawave/webservice/query/data/ObjectSizeOfTest.java new file mode 100644 index 00000000000..26a739eb314 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/webservice/query/data/ObjectSizeOfTest.java @@ -0,0 +1,115 @@ +package datawave.webservice.query.data; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +/** + * Test that object sizeof mechanism + */ +public class ObjectSizeOfTest { + + /** + * @throws java.lang.Exception + */ + @BeforeEach + public void setUp() throws Exception {} + + /** + * @throws java.lang.Exception + */ + @AfterEach + public void tearDown() throws Exception {} + + @Test + public void testPrimitives() { + assertEquals(0, ObjectSizeOf.Sizer.getPrimitiveObjectSize(void.class)); + assertEquals(1, ObjectSizeOf.Sizer.getPrimitiveObjectSize(boolean.class)); + assertEquals(1, ObjectSizeOf.Sizer.getPrimitiveObjectSize(byte.class)); + assertEquals(2, ObjectSizeOf.Sizer.getPrimitiveObjectSize(char.class)); + assertEquals(2, ObjectSizeOf.Sizer.getPrimitiveObjectSize(short.class)); + assertEquals(4, ObjectSizeOf.Sizer.getPrimitiveObjectSize(int.class)); + assertEquals(4, ObjectSizeOf.Sizer.getPrimitiveObjectSize(float.class)); + assertEquals(8, ObjectSizeOf.Sizer.getPrimitiveObjectSize(long.class)); + assertEquals(8, ObjectSizeOf.Sizer.getPrimitiveObjectSize(double.class)); + } + + @Test + public void testRoundUp() { + assertEquals(0, ObjectSizeOf.Sizer.roundUp(0)); + assertEquals(8, ObjectSizeOf.Sizer.roundUp(1)); + assertEquals(8, ObjectSizeOf.Sizer.roundUp(2)); + assertEquals(8, ObjectSizeOf.Sizer.roundUp(3)); + assertEquals(8, ObjectSizeOf.Sizer.roundUp(4)); + assertEquals(8, ObjectSizeOf.Sizer.roundUp(5)); + 
assertEquals(8, ObjectSizeOf.Sizer.roundUp(6)); + assertEquals(8, ObjectSizeOf.Sizer.roundUp(7)); + assertEquals(8, ObjectSizeOf.Sizer.roundUp(8)); + assertEquals(16, ObjectSizeOf.Sizer.roundUp(9)); + assertEquals(16, ObjectSizeOf.Sizer.roundUp(10)); + assertEquals(16, ObjectSizeOf.Sizer.roundUp(11)); + assertEquals(88, ObjectSizeOf.Sizer.roundUp(81)); + assertEquals(168, ObjectSizeOf.Sizer.roundUp(165)); + } + + @Test + public void testNumbers() { + assertEquals(16, ObjectSizeOf.Sizer.getObjectSize(Boolean.TRUE)); + assertEquals(16, ObjectSizeOf.Sizer.getObjectSize(new Byte((byte) 1))); + assertEquals(16, ObjectSizeOf.Sizer.getObjectSize(new Character((char) 1))); + assertEquals(16, ObjectSizeOf.Sizer.getObjectSize(new Short((short) 1))); + assertEquals(16, ObjectSizeOf.Sizer.getObjectSize(new Integer(1))); + assertEquals(16, ObjectSizeOf.Sizer.getObjectSize(new Float(1))); + assertEquals(16, ObjectSizeOf.Sizer.getObjectSize(new Long(1))); + assertEquals(16, ObjectSizeOf.Sizer.getObjectSize(new Double(1))); + } + + @Test + public void testObjects() { + List list = new ArrayList(10); + list.add(new Long(1)); + list.add(new Double(1)); + int overhead = 8; + int arrayoverhead = 12; + int reference = 4; + int intsize = 4; + int numbersize = 16; + long size = ObjectSizeOf.Sizer.roundUp(overhead + intsize + intsize + reference) + ObjectSizeOf.Sizer.roundUp(arrayoverhead + 10 * reference) + + numbersize + numbersize; + assertEquals(size, ObjectSizeOf.Sizer.getObjectSize(list)); + + PrimitiveObject testPrimitive = new PrimitiveObject(); + size = numbersize; + assertEquals(size, ObjectSizeOf.Sizer.getObjectSize(testPrimitive)); + + ObjectSizeOf testSized = new SizedObject(); + size = ObjectSizeOf.Sizer.roundUp(testSized.sizeInBytes()); + assertEquals(size, ObjectSizeOf.Sizer.getObjectSize(testSized)); + + RecursiveObject recursiveObject = new RecursiveObject(); + recursiveObject.o = recursiveObject; + size = ObjectSizeOf.Sizer.roundUp(overhead + reference); + 
assertEquals(size, ObjectSizeOf.Sizer.getObjectSize(recursiveObject)); + } + + public static class PrimitiveObject { + private long value = 0; + } + + public static class SizedObject implements ObjectSizeOf { + @Override + public long sizeInBytes() { + return ObjectSizeOf.Sizer.roundUp(20); + } + + } + + public static class RecursiveObject { + public Object o; + } +} diff --git a/core/utils/type-utils/src/test/java/datawave/webservice/query/util/TypedValueTest.java b/core/utils/type-utils/src/test/java/datawave/webservice/query/util/TypedValueTest.java new file mode 100644 index 00000000000..54ec5f50e68 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/webservice/query/util/TypedValueTest.java @@ -0,0 +1,216 @@ +package datawave.webservice.query.util; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; +import java.io.InputStreamReader; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.text.MessageFormat; +import java.util.Calendar; +import java.util.Date; +import java.util.TimeZone; + +import javax.xml.bind.JAXBContext; + +import org.apache.commons.codec.binary.Base64; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import datawave.data.type.BaseType; +import datawave.data.type.LcNoDiacriticsType; +import datawave.data.type.NoOpType; + +public class TypedValueTest { + + private static String EXPECTED_FORMAT; + private static String EXPECTED_BASE64_FORMAT; + private JAXBContext ctx; + + @BeforeAll + public static void setUpClass() throws Exception { + System.setProperty("user.timezone", "GMT"); + BufferedReader rdr; + rdr = new BufferedReader(new InputStreamReader(TypedValueTest.class.getResourceAsStream("TypedValueExpectedUnencoded.xml"))); + 
EXPECTED_FORMAT = rdr.readLine(); + rdr.close(); + rdr = new BufferedReader(new InputStreamReader(TypedValueTest.class.getResourceAsStream("TypedValueExpectedEncoded.xml"))); + EXPECTED_BASE64_FORMAT = rdr.readLine(); + rdr.close(); + } + + @BeforeEach + public void setUp() throws Exception { + ctx = JAXBContext.newInstance(TypedValue.class); + } + + @Test + public void testPlainString() throws Exception { + TypedValue value = new TypedValue("plainString"); + String actual = serialize(value); + assertEquals(expected("xs:string", "plainString"), actual); + assertFalse(value.isBase64Encoded()); + assertEquals("xs:string", value.getType()); + } + + @Test + public void testEncodedStringString() throws Exception { + TypedValue value = new TypedValue("encoded\0String"); + String actual = serialize(value); + assertEquals(expected64("xs:string", new String(Base64.encodeBase64("encoded\0String".getBytes()))), actual); + assertTrue(value.isBase64Encoded()); + assertEquals("xs:string", value.getType()); + } + + @Test + public void testEncodedNoOpTypeString() throws Exception { + BaseType type = new NoOpType("encoded\0String"); + TypedValue value = new TypedValue(type); + String actual = serialize(value); + assertEquals(expected64("xs:string", new String(Base64.encodeBase64("encoded\0String".getBytes()))), actual); + assertTrue(value.isBase64Encoded()); + assertEquals("xs:string", value.getType()); + } + + @Test + public void testEncodedTypeString() throws Exception { + BaseType type = new LcNoDiacriticsType("encoded\0String"); + TypedValue value = new TypedValue(type); + String actual = serialize(value); + assertEquals(expected64("xs:string", new String(Base64.encodeBase64("encoded\0String".getBytes()))), actual); + assertTrue(value.isBase64Encoded()); + assertEquals("xs:string", value.getType()); + } + + @Test + public void testBoolean() throws Exception { + TypedValue value = new TypedValue(Boolean.TRUE); + String actual = serialize(value); + 
assertEquals(expected("xs:boolean", "true"), actual); + assertFalse(value.isBase64Encoded()); + assertEquals("xs:boolean", value.getType()); + } + + @Test + public void testShort() throws Exception { + TypedValue value = new TypedValue((short) 42); + String actual = serialize(value); + assertEquals(expected("xs:short", "42"), actual); + assertFalse(value.isBase64Encoded()); + assertEquals("xs:short", value.getType()); + } + + @Test + public void testInteger() throws Exception { + TypedValue value = new TypedValue((int) 42); + String actual = serialize(value); + assertEquals(expected("xs:int", "42"), actual); + assertFalse(value.isBase64Encoded()); + assertEquals("xs:int", value.getType()); + } + + @Test + public void testLong() throws Exception { + TypedValue value = new TypedValue(42L); + String actual = serialize(value); + assertEquals(expected("xs:long", "42"), actual); + assertFalse(value.isBase64Encoded()); + assertEquals("xs:long", value.getType()); + } + + @Test + public void testFloat() throws Exception { + TypedValue value = new TypedValue(42.42f); + String actual = serialize(value); + assertEquals(expected("xs:float", "42.42"), actual); + assertFalse(value.isBase64Encoded()); + assertEquals("xs:float", value.getType()); + } + + @Test + public void testDouble() throws Exception { + TypedValue value = new TypedValue(42.42); + String actual = serialize(value); + assertEquals(expected("xs:double", "42.42"), actual); + assertFalse(value.isBase64Encoded()); + assertEquals("xs:double", value.getType()); + } + + @Test + public void testBigDecimal() throws Exception { + TypedValue value = new TypedValue(new BigDecimal("123456789012345678901234567890.123")); + String actual = serialize(value); + assertEquals(expected("xs:decimal", "123456789012345678901234567890.123"), actual); + assertFalse(value.isBase64Encoded()); + assertEquals("xs:decimal", value.getType()); + } + + @Test + public void testBigInteger() throws Exception { + TypedValue value = new TypedValue(new 
BigInteger("123456789012345678901234567890")); + String actual = serialize(value); + assertEquals(expected("xs:integer", "123456789012345678901234567890"), actual); + assertFalse(value.isBase64Encoded()); + assertEquals("xs:integer", value.getType()); + } + + @Test + public void testByte() throws Exception { + TypedValue value = new TypedValue((byte) 42); + String actual = serialize(value); + assertEquals(expected("xs:byte", "42"), actual); + assertFalse(value.isBase64Encoded()); + assertEquals("xs:byte", value.getType()); + } + + @Test + public void testByteArray() throws Exception { + TypedValue value = new TypedValue(new byte[] {(byte) 1, (byte) 2, (byte) 3, (byte) 42}); + String actual = serialize(value); + assertEquals(expected("xs:base64Binary", new String(Base64.encodeBase64((byte[]) value.getValue()))), actual); + assertFalse(value.isBase64Encoded()); + assertEquals("xs:base64Binary", value.getType()); + } + + @Test + public void testDateTime() throws Exception { + Calendar cal = Calendar.getInstance(); + cal.setTimeZone(TimeZone.getTimeZone("EST")); + cal.set(2012, 0, 25, 14, 34, 35); + cal.set(Calendar.MILLISECOND, 525); + TypedValue value = new TypedValue(cal); + String actual = serialize(value); + assertEquals(expected("xs:dateTime", "2012-01-25T14:34:35.525-05:00"), actual); + assertFalse(value.isBase64Encoded()); + assertEquals("xs:dateTime", value.getType()); + } + + @Test + public void testDate() throws Exception { + Date d = new Date(1408552225400L); + TypedValue value = new TypedValue(d); + String actual = serialize(value); + assertEquals(expected("xs:dateTime", "2014-08-20T16:30:25.400Z"), actual); + assertFalse(value.isBase64Encoded()); + assertEquals("xs:dateTime", value.getType()); + } + + private String expected(String xsdType, String value) { + return MessageFormat.format(EXPECTED_FORMAT, xsdType, value); + } + + private String expected64(String xsdType, String value) { + return MessageFormat.format(EXPECTED_BASE64_FORMAT, xsdType, value); + 
} + + private String serialize(TypedValue value) throws Exception { + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + ctx.createMarshaller().marshal(value, bos); + return bos.toString(); + } + +} diff --git a/core/utils/type-utils/src/test/java/datawave/webservice/query/util/XMLUtilTest.java b/core/utils/type-utils/src/test/java/datawave/webservice/query/util/XMLUtilTest.java new file mode 100644 index 00000000000..5f14224ddc4 --- /dev/null +++ b/core/utils/type-utils/src/test/java/datawave/webservice/query/util/XMLUtilTest.java @@ -0,0 +1,26 @@ +package datawave.webservice.query.util; + +import static org.junit.jupiter.api.Assertions.*; + +import org.apache.xerces.util.XMLChar; +import org.junit.jupiter.api.Test; + +public class XMLUtilTest { + @Test + public void testAllCharacters() { + for (int i = 0; i < 0x300000; ++i) { + boolean expectedResult = XMLChar.isValid(i); + assertEquals(expectedResult, XMLUtil.isValidXMLChar(i), "Mismatch for 0x" + Integer.toHexString(i)); + } + } + + @Test + public void testValidXMLString() { + assertTrue(XMLUtil.isValidXML("This is valid XML \u0009\r\n \u0021 \uD1FF")); + } + + @Test + public void testInvalidXMLString() { + assertFalse(XMLUtil.isValidXML("This \u0002 is not valid")); + } +} diff --git a/core/utils/type-utils/src/test/resources/datawave/data/normalizer/geoRanges.txt b/core/utils/type-utils/src/test/resources/datawave/data/normalizer/geoRanges.txt new file mode 100644 index 00000000000..3dc4da3d05b --- /dev/null +++ b/core/utils/type-utils/src/test/resources/datawave/data/normalizer/geoRanges.txt @@ -0,0 +1 @@ 
+000001000103020d020d020702080202020203350335031f0320030a030a04d504d5047f0480042a042a0503550503550501ff0502000500aa0500aa060d54060d570607fc0608030602a80602ab07355007355f071ff007200f070aa0070aaf08d54008d57f087fc008803f082a80082abf090355fd090355fd090355f2090355f9090355ed090355ee090355c6090355e9090355c1090355c2090355b2090355bd090355ad090355ae09035516090355a90903551109035512090355020903550d090200fd090200fe090200c6090200f9090200c1090200c2090200b2090200bd090200ad090200ae090200a7090200a80902009b090200a409020097090200980902006b0902009409020067090200680902005b0902006409020057090200580901ffab090200540901ffa70901ffa80901ff9b0901ffa40901ff970901ff980901ff6b0901ff940901ff670901ff680901ff5b0901ff640901ff570901ff580901ff510901ff520901ff420901ff4d0901ff3d0901ff3e0901ff060901ff390901ff010901ff020900aaf20900aafd0900aaed0900aaee0900aa560900aae90900aa510900aa520900aa420900aa4d0900aa3d0900aa3e0900aa160900aa390900aa110900aa120900aa060900aa0d0900aa020900aa020a0d57f50a0d57f50a0d57e30a0d57e50a0d57cf0a0d57e00a0d57ca0a0d57cc0a0d57b90a0d57ba0a0d57b50a0d57b60a0d57a30a0d57a50a0d571f0a0d57a00a0d571a0a0d571c0a0d57090a0d570a0a0d57050a0d57060a0d56f30a0d56f50a0d56cf0a0d56f00a0d56ca0a0d56cc0a0d56b90a0d56ba0a0d56b50a0d56b60a0d56a30a0d56a50a0d545f0a0d56a00a0d545a0a0d545c0a0d54490a0d544a0a0d54450a0d54460a0d54330a0d54350a0d540f0a0d54300a0d540a0a0d540c0a0803f90a0803fa0a0803f50a0803f60a0803e30a0803e50a08031f0a0803e00a08031a0a08031c0a0803090a08030a0a0803050a0803060a0802f30a0802f50a0802cf0a0802f00a0802ca0a0802cc0a0802b90a0802ba0a0802b50a0802b60a08029e0a0802a00a08026e0a0802910a08025e0a0802610a0801ae0a0802510a08019e0a0801a10a08016e0a0801910a08015e0a0801610a07feae0a0801510a07fe9e0a07fea10a07fe6e0a07fe910a07fe5e0a07fe610a07fdae0a07fe510a07fd9e0a07fda10a07fd6e0a07fd910a07fd5f0a07fd610a07fd490a07fd4a0a07fd450a07fd460a07fd330a07fd350a07fd0f0a07fd300a07fd0a0a07fd0c0a07fcf90a07fcfa0a07fcf50a07fcf60a07fce30a07fce50a07fc1f0a07fce00a07fc1a0a07fc1c0a07fc090a07fc0a0a07fc050a07fc060a02abf30a02abf50a02abcf0a02abf00a02abca0a0
2abcc0a02abb90a02abba0a02abb50a02abb60a02aba30a02aba50a02a95f0a02aba00a02a95a0a02a95c0a02a9490a02a94a0a02a9450a02a9460a02a9330a02a9350a02a90f0a02a9300a02a90a0a02a90c0a02a8f90a02a8fa0a02a8f50a02a8f60a02a8e30a02a8e50a02a85f0a02a8e00a02a85a0a02a85c0a02a8490a02a84a0a02a8450a02a8460a02a8330a02a8350a02a81f0a02a8300a02a81a0a02a81c0a02a80a0a02a80a0b355fd50b355fd50b355f930b355f950b355f8e0b355f900b355f3e0b355f810b355f2f0b355f310b355f2a0b355f2c0b355ee90b355eea0b355ee50b355ee60b355ed90b355eda0b355ed50b355ed60b355e930b355e950b355e8e0b355e900b355c7e0b355e810b355c6f0b355c710b355c6a0b355c6c0b355c290b355c2a0b355c250b355c260b355c190b355c1a0b355c150b355c160b355bd30b355bd50b355bce0b355bd00b355b3e0b355bc10b355b2f0b355b310b355b2a0b355b2c0b355ae90b355aea0b355ae50b355ae60b355ad90b355ada0b355ad50b355ad60b355a930b355a950b355a8e0b355a900b35517e0b355a810b35516f0b3551710b35516a0b35516c0b3551290b35512a0b3551250b3551260b3551190b35511a0b3551150b3551160b3550d30b3550d50b3550ce0b3550d00b35503e0b3550c10b35502f0b3550310b35502a0b35502c0b200fe90b200fea0b200fe50b200fe60b200fd90b200fda0b200fd50b200fd60b200f930b200f950b200f8e0b200f900b200c7e0b200f810b200c6f0b200c710b200c6a0b200c6c0b200c290b200c2a0b200c250b200c260b200c190b200c1a0b200c150b200c160b200bd30b200bd50b200bce0b200bd00b200b3e0b200bc10b200b2f0b200b310b200b2a0b200b2c0b200ae90b200aea0b200ae50b200ae60b200ad90b200ada0b200ad50b200ad60b200a7f0b200a800b200a7a0b200a7c0b200a430b200a450b2009bf0b200a400b2009ba0b2009bc0b2009830b2009850b20097f0b2009800b20097a0b20097c0b2009430b2009450b2006bf0b2009400b2006ba0b2006bc0b2006830b2006850b20067f0b2006800b20067a0b20067c0b2006430b2006450b2005bf0b2006400b2005ba0b2005bc0b2005830b2005850b20057f0b2005800b20057a0b20057c0b2005430b2005450b1ffabf0b2005400b1ffaba0b1ffabc0b1ffa830b1ffa850b1ffa7f0b1ffa800b1ffa7a0b1ffa7c0b1ffa430b1ffa450b1ff9bf0b1ffa400b1ff9ba0b1ff9bc0b1ff9830b1ff9850b1ff97f0b1ff9800b1ff97a0b1ff97c0b1ff9430b1ff9450b1ff6bf0b1ff9400b1ff6ba0b1ff6bc0b1ff6830b1ff6850b1ff67f0b1ff6800b1ff67a0b1ff67c0b1ff6430b1ff6450b1ff5bf0b1
ff6400b1ff5ba0b1ff5bc0b1ff5830b1ff5850b1ff57f0b1ff5800b1ff5290b1ff52a0b1ff5250b1ff5260b1ff5190b1ff51a0b1ff5150b1ff5160b1ff4d30b1ff4d50b1ff4ce0b1ff4d00b1ff43e0b1ff4c10b1ff42f0b1ff4310b1ff42a0b1ff42c0b1ff3e90b1ff3ea0b1ff3e50b1ff3e60b1ff3d90b1ff3da0b1ff3d50b1ff3d60b1ff3930b1ff3950b1ff38e0b1ff3900b1ff07e0b1ff3810b1ff06f0b1ff0710b1ff06a0b1ff06c0b1ff0290b1ff02a0b1ff0250b1ff0260b1ff0190b1ff01a0b1ff0150b1ff0160b0aafd30b0aafd50b0aafce0b0aafd00b0aaf3e0b0aafc10b0aaf2f0b0aaf310b0aaf2a0b0aaf2c0b0aaee90b0aaeea0b0aaee50b0aaee60b0aaed90b0aaeda0b0aaed50b0aaed60b0aae930b0aae950b0aae8e0b0aae900b0aa57e0b0aae810b0aa56f0b0aa5710b0aa56a0b0aa56c0b0aa5290b0aa52a0b0aa5250b0aa5260b0aa5190b0aa51a0b0aa5150b0aa5160b0aa4d30b0aa4d50b0aa4ce0b0aa4d00b0aa43e0b0aa4c10b0aa42f0b0aa4310b0aa42a0b0aa42c0b0aa3e90b0aa3ea0b0aa3e50b0aa3e60b0aa3d90b0aa3da0b0aa3d50b0aa3d60b0aa3930b0aa3950b0aa38e0b0aa3900b0aa17e0b0aa3810b0aa16f0b0aa1710b0aa16a0b0aa16c0b0aa1290b0aa12a0b0aa1250b0aa1260b0aa1190b0aa11a0b0aa1150b0aa1160b0aa0d30b0aa0d50b0aa0ce0b0aa0d00b0aa07e0b0aa0c10b0aa06f0b0aa0710b0aa06a0b0aa06c0b0aa02a0b0aa02a0cd57f540cd57f570cd57e4c0cd57e570cd57e380cd57e430cd57cf80cd57e070cd57cbc0cd57cc70cd57ca80cd57cb30cd57ba40cd57bab0cd57b940cd57b9b0cd57b640cd57b6b0cd57b540cd57b5b0cd57a4c0cd57a570cd57a380cd57a430cd571f80cd57a070cd571bc0cd571c70cd571a80cd571b30cd570a40cd570ab0cd570940cd5709b0cd570640cd5706b0cd570540cd5705b0cd56f4c0cd56f570cd56f380cd56f430cd56cf80cd56f070cd56cbc0cd56cc70cd56ca80cd56cb30cd56ba40cd56bab0cd56b940cd56b9b0cd56b640cd56b6b0cd56b540cd56b5b0cd56a4c0cd56a570cd56a380cd56a430cd545f80cd56a070cd545bc0cd545c70cd545a80cd545b30cd544a40cd544ab0cd544940cd5449b0cd544640cd5446b0cd544540cd5445b0cd5434c0cd543570cd543380cd543430cd540f80cd543070cd540bc0cd540c70cd540a80cd540b30c803fa40c803fab0c803f940c803f9b0c803f640c803f6b0c803f540c803f5b0c803e4c0c803e570c803e380c803e430c8031f80c803e070c8031bc0c8031c70c8031a80c8031b30c8030a40c8030ab0c8030940c80309b0c8030640c80306b0c8030540c80305b0c802f4c0c802f570c802f380c802f430c802cf80c8
02f070c802cbc0c802cc70c802ca80c802cb30c802ba40c802bab0c802b940c802b9b0c802b640c802b6b0c802b540c802b5b0c8029fc0c802a030c8029e80c8029f30c80290c0c8029170c8026fc0c8029030c8026e80c8026f30c80260c0c8026170c8025fc0c8026030c8025e80c8025f30c80250c0c8025170c801afc0c8025030c801ae80c801af30c801a0c0c801a170c8019fc0c801a030c8019e80c8019f30c80190c0c8019170c8016fc0c8019030c8016e80c8016f30c80160c0c8016170c8015fc0c8016030c8015e80c8015f30c80150c0c8015170c7feafc0c8015030c7feae80c7feaf30c7fea0c0c7fea170c7fe9fc0c7fea030c7fe9e80c7fe9f30c7fe90c0c7fe9170c7fe6fc0c7fe9030c7fe6e80c7fe6f30c7fe60c0c7fe6170c7fe5fc0c7fe6030c7fe5e80c7fe5f30c7fe50c0c7fe5170c7fdafc0c7fe5030c7fdae80c7fdaf30c7fda0c0c7fda170c7fd9fc0c7fda030c7fd9e80c7fd9f30c7fd90c0c7fd9170c7fd6fc0c7fd9030c7fd6e80c7fd6f30c7fd60c0c7fd6170c7fd5fc0c7fd6030c7fd4a40c7fd4ab0c7fd4940c7fd49b0c7fd4640c7fd46b0c7fd4540c7fd45b0c7fd34c0c7fd3570c7fd3380c7fd3430c7fd0f80c7fd3070c7fd0bc0c7fd0c70c7fd0a80c7fd0b30c7fcfa40c7fcfab0c7fcf940c7fcf9b0c7fcf640c7fcf6b0c7fcf540c7fcf5b0c7fce4c0c7fce570c7fce380c7fce430c7fc1f80c7fce070c7fc1bc0c7fc1c70c7fc1a80c7fc1b30c7fc0a40c7fc0ab0c7fc0940c7fc09b0c7fc0640c7fc06b0c7fc0540c7fc05b0c2abf4c0c2abf570c2abf380c2abf430c2abcf80c2abf070c2abcbc0c2abcc70c2abca80c2abcb30c2abba40c2abbab0c2abb940c2abb9b0c2abb640c2abb6b0c2abb540c2abb5b0c2aba4c0c2aba570c2aba380c2aba430c2a95f80c2aba070c2a95bc0c2a95c70c2a95a80c2a95b30c2a94a40c2a94ab0c2a94940c2a949b0c2a94640c2a946b0c2a94540c2a945b0c2a934c0c2a93570c2a93380c2a93430c2a90f80c2a93070c2a90bc0c2a90c70c2a90a80c2a90b30c2a8fa40c2a8fab0c2a8f940c2a8f9b0c2a8f640c2a8f6b0c2a8f540c2a8f5b0c2a8e4c0c2a8e570c2a8e380c2a8e430c2a85f80c2a8e070c2a85bc0c2a85c70c2a85a80c2a85b30c2a84a40c2a84ab0c2a84940c2a849b0c2a84640c2a846b0c2a84540c2a845b0c2a834c0c2a83570c2a83380c2a83430c2a81f80c2a83070c2a81bc0c2a81c70c2a81a80c2a81b30c2a80a80c2a80ab0d0355fd500d0355fd5f0d0355f9300d0355f95f0d0355f8e00d0355f90f0d0355f3e00d0355f81f0d0355f2f00d0355f31f0d0355f2a00d0355f2cf0d0355ee900d0355eeaf0d0355ee500d0355ee6f0d0355ed900d0355edaf0d0355e
d500d0355ed6f0d0355e9300d0355e95f0d0355e8e00d0355e90f0d0355c7e00d0355e81f0d0355c6f00d0355c71f0d0355c6a00d0355c6cf0d0355c2900d0355c2af0d0355c2500d0355c26f0d0355c1900d0355c1af0d0355c1500d0355c16f0d0355bd300d0355bd5f0d0355bce00d0355bd0f0d0355b3e00d0355bc1f0d0355b2f00d0355b31f0d0355b2a00d0355b2cf0d0355ae900d0355aeaf0d0355ae500d0355ae6f0d0355ad900d0355adaf0d0355ad500d0355ad6f0d0355a9300d0355a95f0d0355a8e00d0355a90f0d035517e00d0355a81f0d035516f00d0355171f0d035516a00d035516cf0d035512900d035512af0d035512500d0355126f0d035511900d035511af0d035511500d0355116f0d03550d300d03550d5f0d03550ce00d03550d0f0d035503e00d03550c1f0d035502f00d0355031f0d035502a00d035502cf0d0200fe900d0200feaf0d0200fe500d0200fe6f0d0200fd900d0200fdaf0d0200fd500d0200fd6f0d0200f9300d0200f95f0d0200f8e00d0200f90f0d0200c7e00d0200f81f0d0200c6f00d0200c71f0d0200c6a00d0200c6cf0d0200c2900d0200c2af0d0200c2500d0200c26f0d0200c1900d0200c1af0d0200c1500d0200c16f0d0200bd300d0200bd5f0d0200bce00d0200bd0f0d0200b3e00d0200bc1f0d0200b2f00d0200b31f0d0200b2a00d0200b2cf0d0200ae900d0200aeaf0d0200ae500d0200ae6f0d0200ad900d0200adaf0d0200ad500d0200ad6f0d0200a7f00d0200a80f0d0200a7a00d0200a7cf0d0200a4300d0200a45f0d02009bf00d0200a40f0d02009ba00d02009bcf0d020098300d0200985f0d020097f00d0200980f0d020097a00d020097cf0d020094300d0200945f0d02006bf00d0200940f0d02006ba00d02006bcf0d020068300d0200685f0d020067f00d0200680f0d020067a00d020067cf0d020064300d0200645f0d02005bf00d0200640f0d02005ba00d02005bcf0d020058300d0200585f0d020057f00d0200580f0d020057a00d020057cf0d020054300d0200545f0d01ffabf00d0200540f0d01ffaba00d01ffabcf0d01ffa8300d01ffa85f0d01ffa7f00d01ffa80f0d01ffa7a00d01ffa7cf0d01ffa4300d01ffa45f0d01ff9bf00d01ffa40f0d01ff9ba00d01ff9bcf0d01ff98300d01ff985f0d01ff97f00d01ff980f0d01ff97a00d01ff97cf0d01ff94300d01ff945f0d01ff6bf00d01ff940f0d01ff6ba00d01ff6bcf0d01ff68300d01ff685f0d01ff67f00d01ff680f0d01ff67a00d01ff67cf0d01ff64300d01ff645f0d01ff5bf00d01ff640f0d01ff5ba00d01ff5bcf0d01ff58300d01ff585f0d01ff57f00d01ff580f0d01ff52900d01ff52af0d01ff52500d01ff526f0d01ff5
1900d01ff51af0d01ff51500d01ff516f0d01ff4d300d01ff4d5f0d01ff4ce00d01ff4d0f0d01ff43e00d01ff4c1f0d01ff42f00d01ff431f0d01ff42a00d01ff42cf0d01ff3e900d01ff3eaf0d01ff3e500d01ff3e6f0d01ff3d900d01ff3daf0d01ff3d500d01ff3d6f0d01ff39300d01ff395f0d01ff38e00d01ff390f0d01ff07e00d01ff381f0d01ff06f00d01ff071f0d01ff06a00d01ff06cf0d01ff02900d01ff02af0d01ff02500d01ff026f0d01ff01900d01ff01af0d01ff01500d01ff016f0d00aafd300d00aafd5f0d00aafce00d00aafd0f0d00aaf3e00d00aafc1f0d00aaf2f00d00aaf31f0d00aaf2a00d00aaf2cf0d00aaee900d00aaeeaf0d00aaee500d00aaee6f0d00aaed900d00aaedaf0d00aaed500d00aaed6f0d00aae9300d00aae95f0d00aae8e00d00aae90f0d00aa57e00d00aae81f0d00aa56f00d00aa571f0d00aa56a00d00aa56cf0d00aa52900d00aa52af0d00aa52500d00aa526f0d00aa51900d00aa51af0d00aa51500d00aa516f0d00aa4d300d00aa4d5f0d00aa4ce00d00aa4d0f0d00aa43e00d00aa4c1f0d00aa42f00d00aa431f0d00aa42a00d00aa42cf0d00aa3e900d00aa3eaf0d00aa3e500d00aa3e6f0d00aa3d900d00aa3daf0d00aa3d500d00aa3d6f0d00aa39300d00aa395f0d00aa38e00d00aa390f0d00aa17e00d00aa381f0d00aa16f00d00aa171f0d00aa16a00d00aa16cf0d00aa12900d00aa12af0d00aa12500d00aa126f0d00aa11900d00aa11af0d00aa11500d00aa116f0d00aa0d300d00aa0d5f0d00aa0ce00d00aa0d0f0d00aa07e00d00aa0c1f0d00aa06f00d00aa071f0d00aa06a00d00aa06cf0d00aa02a00d00aa02af0e0d57f5400e0d57f57f0e0d57e4c00e0d57e57f0e0d57e3800e0d57e43f0e0d57cf800e0d57e07f0e0d57cbc00e0d57cc7f0e0d57ca800e0d57cb3f0e0d57ba400e0d57babf0e0d57b9400e0d57b9bf0e0d57b6400e0d57b6bf0e0d57b5400e0d57b5bf0e0d57a4c00e0d57a57f0e0d57a3800e0d57a43f0e0d571f800e0d57a07f0e0d571bc00e0d571c7f0e0d571a800e0d571b3f0e0d570a400e0d570abf0e0d5709400e0d5709bf0e0d5706400e0d5706bf0e0d5705400e0d5705bf0e0d56f4c00e0d56f57f0e0d56f3800e0d56f43f0e0d56cf800e0d56f07f0e0d56cbc00e0d56cc7f0e0d56ca800e0d56cb3f0e0d56ba400e0d56babf0e0d56b9400e0d56b9bf0e0d56b6400e0d56b6bf0e0d56b5400e0d56b5bf0e0d56a4c00e0d56a57f0e0d56a3800e0d56a43f0e0d545f800e0d56a07f0e0d545bc00e0d545c7f0e0d545a800e0d545b3f0e0d544a400e0d544abf0e0d5449400e0d5449bf0e0d5446400e0d5446bf0e0d5445400e0d5445bf0e0d5434c00e0d54357f0e0d543
3800e0d54343f0e0d540f800e0d54307f0e0d540bc00e0d540c7f0e0d540a800e0d540b3f0e0803fa400e0803fabf0e0803f9400e0803f9bf0e0803f6400e0803f6bf0e0803f5400e0803f5bf0e0803e4c00e0803e57f0e0803e3800e0803e43f0e08031f800e0803e07f0e08031bc00e08031c7f0e08031a800e08031b3f0e08030a400e08030abf0e080309400e080309bf0e080306400e080306bf0e080305400e080305bf0e0802f4c00e0802f57f0e0802f3800e0802f43f0e0802cf800e0802f07f0e0802cbc00e0802cc7f0e0802ca800e0802cb3f0e0802ba400e0802babf0e0802b9400e0802b9bf0e0802b6400e0802b6bf0e0802b5400e0802b5bf0e08029fc00e0802a03f0e08029e800e08029f3f0e080290c00e0802917f0e08026fc00e0802903f0e08026e800e08026f3f0e080260c00e0802617f0e08025fc00e0802603f0e08025e800e08025f3f0e080250c00e0802517f0e0801afc00e0802503f0e0801ae800e0801af3f0e0801a0c00e0801a17f0e08019fc00e0801a03f0e08019e800e08019f3f0e080190c00e0801917f0e08016fc00e0801903f0e08016e800e08016f3f0e080160c00e0801617f0e08015fc00e0801603f0e08015e800e08015f3f0e080150c00e0801517f0e07feafc00e0801503f0e07feae800e07feaf3f0e07fea0c00e07fea17f0e07fe9fc00e07fea03f0e07fe9e800e07fe9f3f0e07fe90c00e07fe917f0e07fe6fc00e07fe903f0e07fe6e800e07fe6f3f0e07fe60c00e07fe617f0e07fe5fc00e07fe603f0e07fe5e800e07fe5f3f0e07fe50c00e07fe517f0e07fdafc00e07fe503f0e07fdae800e07fdaf3f0e07fda0c00e07fda17f0e07fd9fc00e07fda03f0e07fd9e800e07fd9f3f0e07fd90c00e07fd917f0e07fd6fc00e07fd903f0e07fd6e800e07fd6f3f0e07fd60c00e07fd617f0e07fd5fc00e07fd603f0e07fd4a400e07fd4abf0e07fd49400e07fd49bf0e07fd46400e07fd46bf0e07fd45400e07fd45bf0e07fd34c00e07fd357f0e07fd33800e07fd343f0e07fd0f800e07fd307f0e07fd0bc00e07fd0c7f0e07fd0a800e07fd0b3f0e07fcfa400e07fcfabf0e07fcf9400e07fcf9bf0e07fcf6400e07fcf6bf0e07fcf5400e07fcf5bf0e07fce4c00e07fce57f0e07fce3800e07fce43f0e07fc1f800e07fce07f0e07fc1bc00e07fc1c7f0e07fc1a800e07fc1b3f0e07fc0a400e07fc0abf0e07fc09400e07fc09bf0e07fc06400e07fc06bf0e07fc05400e07fc05bf0e02abf4c00e02abf57f0e02abf3800e02abf43f0e02abcf800e02abf07f0e02abcbc00e02abcc7f0e02abca800e02abcb3f0e02abba400e02abbabf0e02abb9400e02abb9bf0e02abb6400e02abb6bf0e02abb5400e02abb5bf0e02aba
4c00e02aba57f0e02aba3800e02aba43f0e02a95f800e02aba07f0e02a95bc00e02a95c7f0e02a95a800e02a95b3f0e02a94a400e02a94abf0e02a949400e02a949bf0e02a946400e02a946bf0e02a945400e02a945bf0e02a934c00e02a9357f0e02a933800e02a9343f0e02a90f800e02a9307f0e02a90bc00e02a90c7f0e02a90a800e02a90b3f0e02a8fa400e02a8fabf0e02a8f9400e02a8f9bf0e02a8f6400e02a8f6bf0e02a8f5400e02a8f5bf0e02a8e4c00e02a8e57f0e02a8e3800e02a8e43f0e02a85f800e02a8e07f0e02a85bc00e02a85c7f0e02a85a800e02a85b3f0e02a84a400e02a84abf0e02a849400e02a849bf0e02a846400e02a846bf0e02a845400e02a845bf0e02a834c00e02a8357f0e02a833800e02a8343f0e02a81f800e02a8307f0e02a81bc00e02a81c7f0e02a81a800e02a81b3f0e02a80a800e02a80abf0f355fd5000f355fd5ff0f355f93000f355f95ff0f355f8e000f355f90ff0f355f3e000f355f81ff0f355f2f000f355f31ff0f355f2a000f355f2cff0f355ee9000f355eeaff0f355ee5000f355ee6ff0f355ed9000f355edaff0f355ed5000f355ed6ff0f355e93000f355e95ff0f355e8e000f355e90ff0f355c7e000f355e81ff0f355c6f000f355c71ff0f355c6a000f355c6cff0f355c29000f355c2aff0f355c25000f355c26ff0f355c19000f355c1aff0f355c15000f355c16ff0f355bd3000f355bd5ff0f355bce000f355bd0ff0f355b3e000f355bc1ff0f355b2f000f355b31ff0f355b2a000f355b2cff0f355ae9000f355aeaff0f355ae5000f355ae6ff0f355ad9000f355adaff0f355ad5000f355ad6ff0f355a93000f355a95ff0f355a8e000f355a90ff0f35517e000f355a81ff0f35516f000f355171ff0f35516a000f35516cff0f355129000f35512aff0f355125000f355126ff0f355119000f35511aff0f355115000f355116ff0f3550d3000f3550d5ff0f3550ce000f3550d0ff0f35503e000f3550c1ff0f35502f000f355031ff0f35502a000f35502cff0f200fe9000f200feaff0f200fe5000f200fe6ff0f200fd9000f200fdaff0f200fd5000f200fd6ff0f200f93000f200f95ff0f200f8e000f200f90ff0f200c7e000f200f81ff0f200c6f000f200c71ff0f200c6a000f200c6cff0f200c29000f200c2aff0f200c25000f200c26ff0f200c19000f200c1aff0f200c15000f200c16ff0f200bd3000f200bd5ff0f200bce000f200bd0ff0f200b3e000f200bc1ff0f200b2f000f200b31ff0f200b2a000f200b2cff0f200ae9000f200aeaff0f200ae5000f200ae6ff0f200ad9000f200adaff0f200ad5000f200ad6ff0f200a7f000f200a80ff0f200a7a000f200a7cff0f200a43000f200a45ff0f2009b
f000f200a40ff0f2009ba000f2009bcff0f200983000f200985ff0f20097f000f200980ff0f20097a000f20097cff0f200943000f200945ff0f2006bf000f200940ff0f2006ba000f2006bcff0f200683000f200685ff0f20067f000f200680ff0f20067a000f20067cff0f200643000f200645ff0f2005bf000f200640ff0f2005ba000f2005bcff0f200583000f200585ff0f20057f000f200580ff0f20057a000f20057cff0f200543000f200545ff0f1ffabf000f200540ff0f1ffaba000f1ffabcff0f1ffa83000f1ffa85ff0f1ffa7f000f1ffa80ff0f1ffa7a000f1ffa7cff0f1ffa43000f1ffa45ff0f1ff9bf000f1ffa40ff0f1ff9ba000f1ff9bcff0f1ff983000f1ff985ff0f1ff97f000f1ff980ff0f1ff97a000f1ff97cff0f1ff943000f1ff945ff0f1ff6bf000f1ff940ff0f1ff6ba000f1ff6bcff0f1ff683000f1ff685ff0f1ff67f000f1ff680ff0f1ff67a000f1ff67cff0f1ff643000f1ff645ff0f1ff5bf000f1ff640ff0f1ff5ba000f1ff5bcff0f1ff583000f1ff585ff0f1ff57f000f1ff580ff0f1ff529000f1ff52aff0f1ff525000f1ff526ff0f1ff519000f1ff51aff0f1ff515000f1ff516ff0f1ff4d3000f1ff4d5ff0f1ff4ce000f1ff4d0ff0f1ff43e000f1ff4c1ff0f1ff42f000f1ff431ff0f1ff42a000f1ff42cff0f1ff3e9000f1ff3eaff0f1ff3e5000f1ff3e6ff0f1ff3d9000f1ff3daff0f1ff3d5000f1ff3d6ff0f1ff393000f1ff395ff0f1ff38e000f1ff390ff0f1ff07e000f1ff381ff0f1ff06f000f1ff071ff0f1ff06a000f1ff06cff0f1ff029000f1ff02aff0f1ff025000f1ff026ff0f1ff019000f1ff01aff0f1ff015000f1ff016ff0f0aafd3000f0aafd5ff0f0aafce000f0aafd0ff0f0aaf3e000f0aafc1ff0f0aaf2f000f0aaf31ff0f0aaf2a000f0aaf2cff0f0aaee9000f0aaeeaff0f0aaee5000f0aaee6ff0f0aaed9000f0aaedaff0f0aaed5000f0aaed6ff0f0aae93000f0aae95ff0f0aae8e000f0aae90ff0f0aa57e000f0aae81ff0f0aa56f000f0aa571ff0f0aa56a000f0aa56cff0f0aa529000f0aa52aff0f0aa525000f0aa526ff0f0aa519000f0aa51aff0f0aa515000f0aa516ff0f0aa4d3000f0aa4d5ff0f0aa4ce000f0aa4d0ff0f0aa43e000f0aa4c1ff0f0aa42f000f0aa431ff0f0aa42a000f0aa42cff0f0aa3e9000f0aa3eaff0f0aa3e5000f0aa3e6ff0f0aa3d9000f0aa3daff0f0aa3d5000f0aa3d6ff0f0aa393000f0aa395ff0f0aa38e000f0aa390ff0f0aa17e000f0aa381ff0f0aa16f000f0aa171ff0f0aa16a000f0aa16cff0f0aa129000f0aa12aff0f0aa125000f0aa126ff0f0aa119000f0aa11aff0f0aa115000f0aa116ff0f0aa0d3000f0aa0d5ff0f0aa0ce000f0aa0d0ff0f0aa07
e000f0aa0c1ff0f0aa06f000f0aa071ff0f0aa06a000f0aa06cff0f0aa02a000f0aa02aff10d57f540010d57f57ff10d57e4c0010d57e57ff10d57e380010d57e43ff10d57cf80010d57e07ff10d57cbc0010d57cc7ff10d57ca80010d57cb3ff10d57ba40010d57babff10d57b940010d57b9bff10d57b640010d57b6bff10d57b540010d57b5bff10d57a4c0010d57a57ff10d57a380010d57a43ff10d571f80010d57a07ff10d571bc0010d571c7ff10d571a80010d571b3ff10d570a40010d570abff10d570940010d5709bff10d570640010d5706bff10d570540010d5705bff10d56f4c0010d56f57ff10d56f380010d56f43ff10d56cf80010d56f07ff10d56cbc0010d56cc7ff10d56ca80010d56cb3ff10d56ba40010d56babff10d56b940010d56b9bff10d56b640010d56b6bff10d56b540010d56b5bff10d56a4c0010d56a57ff10d56a380010d56a43ff10d545f80010d56a07ff10d545bc0010d545c7ff10d545a80010d545b3ff10d544a40010d544abff10d544940010d5449bff10d544640010d5446bff10d544540010d5445bff10d5434c0010d54357ff10d543380010d54343ff10d540f80010d54307ff10d540bc0010d540c7ff10d540a80010d540b3ff10803fa40010803fabff10803f940010803f9bff10803f640010803f6bff10803f540010803f5bff10803e4c0010803e57ff10803e380010803e43ff108031f80010803e07ff108031bc00108031c7ff108031a800108031b3ff108030a400108030abff10803094001080309bff10803064001080306bff10803054001080305bff10802f4c0010802f57ff10802f380010802f43ff10802cf80010802f07ff10802cbc0010802cc7ff10802ca80010802cb3ff10802ba40010802babff10802b940010802b9bff10802b640010802b6bff10802b540010802b5bff108029fc0010802a03ff108029e800108029f3ff1080290c0010802917ff108026fc0010802903ff108026e800108026f3ff1080260c0010802617ff108025fc0010802603ff108025e800108025f3ff1080250c0010802517ff10801afc0010802503ff10801ae80010801af3ff10801a0c0010801a17ff108019fc0010801a03ff108019e800108019f3ff1080190c0010801917ff108016fc0010801903ff108016e800108016f3ff1080160c0010801617ff108015fc0010801603ff108015e800108015f3ff1080150c0010801517ff107feafc0010801503ff107feae800107feaf3ff107fea0c00107fea17ff107fe9fc00107fea03ff107fe9e800107fe9f3ff107fe90c00107fe917ff107fe6fc00107fe903ff107fe6e800107fe6f3ff107fe60c00107fe617ff107fe5fc00107fe603ff107fe5e800107fe5f3ff107fe50
c00107fe517ff107fdafc00107fe503ff107fdae800107fdaf3ff107fda0c00107fda17ff107fd9fc00107fda03ff107fd9e800107fd9f3ff107fd90c00107fd917ff107fd6fc00107fd903ff107fd6e800107fd6f3ff107fd60c00107fd617ff107fd5fc00107fd603ff107fd4a400107fd4abff107fd49400107fd49bff107fd46400107fd46bff107fd45400107fd45bff107fd34c00107fd357ff107fd33800107fd343ff107fd0f800107fd307ff107fd0bc00107fd0c7ff107fd0a800107fd0b3ff107fcfa400107fcfabff107fcf9400107fcf9bff107fcf6400107fcf6bff107fcf5400107fcf5bff107fce4c00107fce57ff107fce3800107fce43ff107fc1f800107fce07ff107fc1bc00107fc1c7ff107fc1a800107fc1b3ff107fc0a400107fc0abff107fc09400107fc09bff107fc06400107fc06bff107fc05400107fc05bff102abf4c00102abf57ff102abf3800102abf43ff102abcf800102abf07ff102abcbc00102abcc7ff102abca800102abcb3ff102abba400102abbabff102abb9400102abb9bff102abb6400102abb6bff102abb5400102abb5bff102aba4c00102aba57ff102aba3800102aba43ff102a95f800102aba07ff102a95bc00102a95c7ff102a95a800102a95b3ff102a94a400102a94abff102a949400102a949bff102a946400102a946bff102a945400102a945bff102a934c00102a9357ff102a933800102a9343ff102a90f800102a9307ff102a90bc00102a90c7ff102a90a800102a90b3ff102a8fa400102a8fabff102a8f9400102a8f9bff102a8f6400102a8f6bff102a8f5400102a8f5bff102a8e4c00102a8e57ff102a8e3800102a8e43ff102a85f800102a8e07ff102a85bc00102a85c7ff102a85a800102a85b3ff102a84a400102a84abff102a849400102a849bff102a846400102a846bff102a845400102a845bff102a834c00102a8357ff102a833800102a8343ff102a81f800102a8307ff102a81bc00102a81c7ff102a81a800102a81b3ff102a80a800102a80abff110355fd5000110355fd5fff110355f93000110355f95fff110355f8e000110355f90fff110355f3e000110355f81fff110355f2f000110355f31fff110355f2a000110355f2cfff110355ee9000110355eeafff110355ee5000110355ee6fff110355ed9000110355edafff110355ed5000110355ed6fff110355e93000110355e95fff110355e8e000110355e90fff110355c7e000110355e81fff110355c6f000110355c71fff110355c6a000110355c6cfff110355c29000110355c2afff110355c25000110355c26fff110355c19000110355c1afff110355c15000110355c16fff110355bd3000110355bd5fff110355bce000110355bd0fff110
355b3e000110355bc1fff110355b2f000110355b31fff110355b2a000110355b2cfff110355ae9000110355aeafff110355ae5000110355ae6fff110355ad9000110355adafff110355ad5000110355ad6fff110355a93000110355a95fff110355a8e000110355a90fff11035517e000110355a81fff11035516f000110355171fff11035516a00011035516cfff11035512900011035512afff110355125000110355126fff11035511900011035511afff110355115000110355116fff1103550d30001103550d5fff1103550ce0001103550d0fff11035503e0001103550c1fff11035502f000110355031fff11035502a00011035502cfff110200fe9000110200feafff110200fe5000110200fe6fff110200fd9000110200fdafff110200fd5000110200fd6fff110200f93000110200f95fff110200f8e000110200f90fff110200c7e000110200f81fff110200c6f000110200c71fff110200c6a000110200c6cfff110200c29000110200c2afff110200c25000110200c26fff110200c19000110200c1afff110200c15000110200c16fff110200bd3000110200bd5fff110200bce000110200bd0fff110200b3e000110200bc1fff110200b2f000110200b31fff110200b2a000110200b2cfff110200ae9000110200aeafff110200ae5000110200ae6fff110200ad9000110200adafff110200ad5000110200ad6fff110200a7f000110200a80fff110200a7a000110200a7cfff110200a43000110200a45fff1102009bf000110200a40fff1102009ba0001102009bcfff110200983000110200985fff11020097f000110200980fff11020097a00011020097cfff110200943000110200945fff1102006bf000110200940fff1102006ba0001102006bcfff110200683000110200685fff11020067f000110200680fff11020067a00011020067cfff110200643000110200645fff1102005bf000110200640fff1102005ba0001102005bcfff110200583000110200585fff11020057f000110200580fff11020057a00011020057cfff110200543000110200545fff1101ffabf000110200540fff1101ffaba0001101ffabcfff1101ffa830001101ffa85fff1101ffa7f0001101ffa80fff1101ffa7a0001101ffa7cfff1101ffa430001101ffa45fff1101ff9bf0001101ffa40fff1101ff9ba0001101ff9bcfff1101ff9830001101ff985fff1101ff97f0001101ff980fff1101ff97a0001101ff97cfff1101ff9430001101ff945fff1101ff6bf0001101ff940fff1101ff6ba0001101ff6bcfff1101ff6830001101ff685fff1101ff67f0001101ff680fff1101ff67a0001101ff67cfff1101ff6430001101ff645fff1101ff5bf0001101ff640fff1101ff5ba00
01101ff5bcfff1101ff5830001101ff585fff1101ff57f0001101ff580fff1101ff5290001101ff52afff1101ff5250001101ff526fff1101ff5190001101ff51afff1101ff5150001101ff516fff1101ff4d30001101ff4d5fff1101ff4ce0001101ff4d0fff1101ff43e0001101ff4c1fff1101ff42f0001101ff431fff1101ff42a0001101ff42cfff1101ff3e90001101ff3eafff1101ff3e50001101ff3e6fff1101ff3d90001101ff3dafff1101ff3d50001101ff3d6fff1101ff3930001101ff395fff1101ff38e0001101ff390fff1101ff07e0001101ff381fff1101ff06f0001101ff071fff1101ff06a0001101ff06cfff1101ff0290001101ff02afff1101ff0250001101ff026fff1101ff0190001101ff01afff1101ff0150001101ff016fff1100aafd30001100aafd5fff1100aafce0001100aafd0fff1100aaf3e0001100aafc1fff1100aaf2f0001100aaf31fff1100aaf2a0001100aaf2cfff1100aaee90001100aaeeafff1100aaee50001100aaee6fff1100aaed90001100aaedafff1100aaed50001100aaed6fff1100aae930001100aae95fff1100aae8e0001100aae90fff1100aa57e0001100aae81fff1100aa56f0001100aa571fff1100aa56a0001100aa56cfff1100aa5290001100aa52afff1100aa5250001100aa526fff1100aa5190001100aa51afff1100aa5150001100aa516fff1100aa4d30001100aa4d5fff1100aa4ce0001100aa4d0fff1100aa43e0001100aa4c1fff1100aa42f0001100aa431fff1100aa42a0001100aa42cfff1100aa3e90001100aa3eafff1100aa3e50001100aa3e6fff1100aa3d90001100aa3dafff1100aa3d50001100aa3d6fff1100aa3930001100aa395fff1100aa38e0001100aa390fff1100aa17e0001100aa381fff1100aa16f0001100aa171fff1100aa16a0001100aa16cfff1100aa1290001100aa12afff1100aa1250001100aa126fff1100aa1190001100aa11afff1100aa1150001100aa116fff1100aa0d30001100aa0d5fff1100aa0ce0001100aa0d0fff1100aa07e0001100aa0c1fff1100aa06f0001100aa071fff1100aa06a0001100aa06cfff1100aa02a0001100aa02afff120d57f54000120d57f57fff120d57e4c000120d57e57fff120d57e38000120d57e43fff120d57cf8000120d57e07fff120d57cbc000120d57cc7fff120d57ca8000120d57cb3fff120d57ba4000120d57babfff120d57b94000120d57b9bfff120d57b64000120d57b6bfff120d57b54000120d57b5bfff120d57a4c000120d57a57fff120d57a38000120d57a43fff120d571f8000120d57a07fff120d571bc000120d571c7fff120d571a8000120d571b3fff120d570a4000120d570abfff120d57094000120d570
9bfff120d57064000120d5706bfff120d57054000120d5705bfff120d56f4c000120d56f57fff120d56f38000120d56f43fff120d56cf8000120d56f07fff120d56cbc000120d56cc7fff120d56ca8000120d56cb3fff120d56ba4000120d56babfff120d56b94000120d56b9bfff120d56b64000120d56b6bfff120d56b54000120d56b5bfff120d56a4c000120d56a57fff120d56a38000120d56a43fff120d545f8000120d56a07fff120d545bc000120d545c7fff120d545a8000120d545b3fff120d544a4000120d544abfff120d54494000120d5449bfff120d54464000120d5446bfff120d54454000120d5445bfff120d5434c000120d54357fff120d54338000120d54343fff120d540f8000120d54307fff120d540bc000120d540c7fff120d540a8000120d540b3fff120803fa4000120803fabfff120803f94000120803f9bfff120803f64000120803f6bfff120803f54000120803f5bfff120803e4c000120803e57fff120803e38000120803e43fff1208031f8000120803e07fff1208031bc0001208031c7fff1208031a80001208031b3fff1208030a40001208030abfff12080309400012080309bfff12080306400012080306bfff12080305400012080305bfff120802f4c000120802f57fff120802f38000120802f43fff120802cf8000120802f07fff120802cbc000120802cc7fff120802ca8000120802cb3fff120802ba4000120802babfff120802b94000120802b9bfff120802b64000120802b6bfff120802b54000120802b5bfff1208029fc000120802a03fff1208029e80001208029f3fff12080290c000120802917fff1208026fc000120802903fff1208026e80001208026f3fff12080260c000120802617fff1208025fc000120802603fff1208025e80001208025f3fff12080250c000120802517fff120801afc000120802503fff120801ae8000120801af3fff120801a0c000120801a17fff1208019fc000120801a03fff1208019e80001208019f3fff12080190c000120801917fff1208016fc000120801903fff1208016e80001208016f3fff12080160c000120801617fff1208015fc000120801603fff1208015e80001208015f3fff12080150c000120801517fff1207feafc000120801503fff1207feae80001207feaf3fff1207fea0c0001207fea17fff1207fe9fc0001207fea03fff1207fe9e80001207fe9f3fff1207fe90c0001207fe917fff1207fe6fc0001207fe903fff1207fe6e80001207fe6f3fff1207fe60c0001207fe617fff1207fe5fc0001207fe603fff1207fe5e80001207fe5f3fff1207fe50c0001207fe517fff1207fdafc0001207fe503fff1207fdae80001207fdaf3fff1207fda0c0001207fda17fff120
7fd9fc0001207fda03fff1207fd9e80001207fd9f3fff1207fd90c0001207fd917fff1207fd6fc0001207fd903fff1207fd6e80001207fd6f3fff1207fd60c0001207fd617fff1207fd5fc0001207fd603fff1207fd4a40001207fd4abfff1207fd4940001207fd49bfff1207fd4640001207fd46bfff1207fd4540001207fd45bfff1207fd34c0001207fd357fff1207fd3380001207fd343fff1207fd0f80001207fd307fff1207fd0bc0001207fd0c7fff1207fd0a80001207fd0b3fff1207fcfa40001207fcfabfff1207fcf940001207fcf9bfff1207fcf640001207fcf6bfff1207fcf540001207fcf5bfff1207fce4c0001207fce57fff1207fce380001207fce43fff1207fc1f80001207fce07fff1207fc1bc0001207fc1c7fff1207fc1a80001207fc1b3fff1207fc0a40001207fc0abfff1207fc0940001207fc09bfff1207fc0640001207fc06bfff1207fc0540001207fc05bfff1202abf4c0001202abf57fff1202abf380001202abf43fff1202abcf80001202abf07fff1202abcbc0001202abcc7fff1202abca80001202abcb3fff1202abba40001202abbabfff1202abb940001202abb9bfff1202abb640001202abb6bfff1202abb540001202abb5bfff1202aba4c0001202aba57fff1202aba380001202aba43fff1202a95f80001202aba07fff1202a95bc0001202a95c7fff1202a95a80001202a95b3fff1202a94a40001202a94abfff1202a94940001202a949bfff1202a94640001202a946bfff1202a94540001202a945bfff1202a934c0001202a9357fff1202a93380001202a9343fff1202a90f80001202a9307fff1202a90bc0001202a90c7fff1202a90a80001202a90b3fff1202a8fa40001202a8fabfff1202a8f940001202a8f9bfff1202a8f640001202a8f6bfff1202a8f540001202a8f5bfff1202a8e4c0001202a8e57fff1202a8e380001202a8e43fff1202a85f80001202a8e07fff1202a85bc0001202a85c7fff1202a85a80001202a85b3fff1202a84a40001202a84abfff1202a84940001202a849bfff1202a84640001202a846bfff1202a84540001202a845bfff1202a834c0001202a8357fff1202a83380001202a8343fff1202a81f80001202a8307fff1202a81bc0001202a81c7fff1202a81a80001202a81b3fff1202a80a80001202a80abfff13355fd5000013355fd5ffff13355f93000013355f95ffff13355f8e000013355f90ffff13355f3e000013355f81ffff13355f2f000013355f31ffff13355f2a000013355f2cffff13355ee9000013355eeaffff13355ee5000013355ee6ffff13355ed9000013355edaffff13355ed5000013355ed6ffff13355e93000013355e95ffff13355e8e000013355e90ffff13355c7e000
013355e81ffff13355c6f000013355c71ffff13355c6a000013355c6cffff13355c29000013355c2affff13355c25000013355c26ffff13355c19000013355c1affff13355c15000013355c16ffff13355bd3000013355bd5ffff13355bce000013355bd0ffff13355b3e000013355bc1ffff13355b2f000013355b31ffff13355b2a000013355b2cffff13355ae9000013355aeaffff13355ae5000013355ae6ffff13355ad9000013355adaffff13355ad5000013355ad6ffff13355a93000013355a95ffff13355a8e000013355a90ffff1335517e000013355a81ffff1335516f000013355171ffff1335516a00001335516cffff1335512900001335512affff13355125000013355126ffff1335511900001335511affff13355115000013355116ffff133550d30000133550d5ffff133550ce0000133550d0ffff1335503e0000133550c1ffff1335502f000013355031ffff1335502a00001335502cffff13200fe9000013200feaffff13200fe5000013200fe6ffff13200fd9000013200fdaffff13200fd5000013200fd6ffff13200f93000013200f95ffff13200f8e000013200f90ffff13200c7e000013200f81ffff13200c6f000013200c71ffff13200c6a000013200c6cffff13200c29000013200c2affff13200c25000013200c26ffff13200c19000013200c1affff13200c15000013200c16ffff13200bd3000013200bd5ffff13200bce000013200bd0ffff13200b3e000013200bc1ffff13200b2f000013200b31ffff13200b2a000013200b2cffff13200ae9000013200aeaffff13200ae5000013200ae6ffff13200ad9000013200adaffff13200ad5000013200ad6ffff13200a7f000013200a80ffff13200a7a000013200a7cffff13200a43000013200a45ffff132009bf000013200a40ffff132009ba0000132009bcffff13200983000013200985ffff1320097f000013200980ffff1320097a00001320097cffff13200943000013200945ffff132006bf000013200940ffff132006ba0000132006bcffff13200683000013200685ffff1320067f000013200680ffff1320067a00001320067cffff13200643000013200645ffff132005bf000013200640ffff132005ba0000132005bcffff13200583000013200585ffff1320057f000013200580ffff1320057a00001320057cffff13200543000013200545ffff131ffabf000013200540ffff131ffaba0000131ffabcffff131ffa830000131ffa85ffff131ffa7f0000131ffa80ffff131ffa7a0000131ffa7cffff131ffa430000131ffa45ffff131ff9bf0000131ffa40ffff131ff9ba0000131ff9bcffff131ff9830000131ff985ffff131ff97f0000131ff980ffff131ff97a0000131ff97
cffff131ff9430000131ff945ffff131ff6bf0000131ff940ffff131ff6ba0000131ff6bcffff131ff6830000131ff685ffff131ff67f0000131ff680ffff131ff67a0000131ff67cffff131ff6430000131ff645ffff131ff5bf0000131ff640ffff131ff5ba0000131ff5bcffff131ff5830000131ff585ffff131ff57f0000131ff580ffff131ff5290000131ff52affff131ff5250000131ff526ffff131ff5190000131ff51affff131ff5150000131ff516ffff131ff4d30000131ff4d5ffff131ff4ce0000131ff4d0ffff131ff43e0000131ff4c1ffff131ff42f0000131ff431ffff131ff42a0000131ff42cffff131ff3e90000131ff3eaffff131ff3e50000131ff3e6ffff131ff3d90000131ff3daffff131ff3d50000131ff3d6ffff131ff3930000131ff395ffff131ff38e0000131ff390ffff131ff07e0000131ff381ffff131ff06f0000131ff071ffff131ff06a0000131ff06cffff131ff0290000131ff02affff131ff0250000131ff026ffff131ff0190000131ff01affff131ff0150000131ff016ffff130aafd30000130aafd5ffff130aafce0000130aafd0ffff130aaf3e0000130aafc1ffff130aaf2f0000130aaf31ffff130aaf2a0000130aaf2cffff130aaee90000130aaeeaffff130aaee50000130aaee6ffff130aaed90000130aaedaffff130aaed50000130aaed6ffff130aae930000130aae95ffff130aae8e0000130aae90ffff130aa57e0000130aae81ffff130aa56f0000130aa571ffff130aa56a0000130aa56cffff130aa5290000130aa52affff130aa5250000130aa526ffff130aa5190000130aa51affff130aa5150000130aa516ffff130aa4d30000130aa4d5ffff130aa4ce0000130aa4d0ffff130aa43e0000130aa4c1ffff130aa42f0000130aa431ffff130aa42a0000130aa42cffff130aa3e90000130aa3eaffff130aa3e50000130aa3e6ffff130aa3d90000130aa3daffff130aa3d50000130aa3d6ffff130aa3930000130aa395ffff130aa38e0000130aa390ffff130aa17e0000130aa381ffff130aa16f0000130aa171ffff130aa16a0000130aa16cffff130aa1290000130aa12affff130aa1250000130aa126ffff130aa1190000130aa11affff130aa1150000130aa116ffff130aa0d30000130aa0d5ffff130aa0ce0000130aa0d0ffff130aa07e0000130aa0c1ffff130aa06f0000130aa071ffff130aa06a0000130aa06cffff130aa02a0000130aa02affff14d57f54000014d57f57ffff14d57e4c000014d57e57ffff14d57e38000014d57e43ffff14d57cf8000014d57e07ffff14d57cbc000014d57cc7ffff14d57ca8000014d57cb3ffff14d57ba4000014d57babffff14d57b94000014d57b9bffff14d
57b64000014d57b6bffff14d57b54000014d57b5bffff14d57a4c000014d57a57ffff14d57a38000014d57a43ffff14d571f8000014d57a07ffff14d571bc000014d571c7ffff14d571a8000014d571b3ffff14d570a4000014d570abffff14d57094000014d5709bffff14d57064000014d5706bffff14d57054000014d5705bffff14d56f4c000014d56f57ffff14d56f38000014d56f43ffff14d56cf8000014d56f07ffff14d56cbc000014d56cc7ffff14d56ca8000014d56cb3ffff14d56ba4000014d56babffff14d56b94000014d56b9bffff14d56b64000014d56b6bffff14d56b54000014d56b5bffff14d56a4c000014d56a57ffff14d56a38000014d56a43ffff14d545f8000014d56a07ffff14d545bc000014d545c7ffff14d545a8000014d545b3ffff14d544a4000014d544abffff14d54494000014d5449bffff14d54464000014d5446bffff14d54454000014d5445bffff14d5434c000014d54357ffff14d54338000014d54343ffff14d540f8000014d54307ffff14d540bc000014d540c7ffff14d540a8000014d540b3ffff14803fa4000014803fabffff14803f94000014803f9bffff14803f64000014803f6bffff14803f54000014803f5bffff14803e4c000014803e57ffff14803e38000014803e43ffff148031f8000014803e07ffff148031bc0000148031c7ffff148031a80000148031b3ffff148030a40000148030abffff1480309400001480309bffff1480306400001480306bffff1480305400001480305bffff14802f4c000014802f57ffff14802f38000014802f43ffff14802cf8000014802f07ffff14802cbc000014802cc7ffff14802ca8000014802cb3ffff14802ba4000014802babffff14802b94000014802b9bffff14802b64000014802b6bffff14802b54000014802b5bffff148029fc000014802a03ffff148029e80000148029f3ffff1480290c000014802917ffff148026fc000014802903ffff148026e80000148026f3ffff1480260c000014802617ffff148025fc000014802603ffff148025e80000148025f3ffff1480250c000014802517ffff14801afc000014802503ffff14801ae8000014801af3ffff14801a0c000014801a17ffff148019fc000014801a03ffff148019e80000148019f3ffff1480190c000014801917ffff148016fc000014801903ffff148016e80000148016f3ffff1480160c000014801617ffff148015fc000014801603ffff148015e80000148015f3ffff1480150c000014801517ffff147feafc000014801503ffff147feae80000147feaf3ffff147fea0c0000147fea17ffff147fe9fc0000147fea03ffff147fe9e80000147fe9f3ffff147fe90c0000147fe917ffff147fe6fc000
0147fe903ffff147fe6e80000147fe6f3ffff147fe60c0000147fe617ffff147fe5fc0000147fe603ffff147fe5e80000147fe5f3ffff147fe50c0000147fe517ffff147fdafc0000147fe503ffff147fdae80000147fdaf3ffff147fda0c0000147fda17ffff147fd9fc0000147fda03ffff147fd9e80000147fd9f3ffff147fd90c0000147fd917ffff147fd6fc0000147fd903ffff147fd6e80000147fd6f3ffff147fd60c0000147fd617ffff147fd5fc0000147fd603ffff147fd4a40000147fd4abffff147fd4940000147fd49bffff147fd4640000147fd46bffff147fd4540000147fd45bffff147fd34c0000147fd357ffff147fd3380000147fd343ffff147fd0f80000147fd307ffff147fd0bc0000147fd0c7ffff147fd0a80000147fd0b3ffff147fcfa40000147fcfabffff147fcf940000147fcf9bffff147fcf640000147fcf6bffff147fcf540000147fcf5bffff147fce4c0000147fce57ffff147fce380000147fce43ffff147fc1f80000147fce07ffff147fc1bc0000147fc1c7ffff147fc1a80000147fc1b3ffff147fc0a40000147fc0abffff147fc0940000147fc09bffff147fc0640000147fc06bffff147fc0540000147fc05bffff142abf4c0000142abf57ffff142abf380000142abf43ffff142abcf80000142abf07ffff142abcbc0000142abcc7ffff142abca80000142abcb3ffff142abba40000142abbabffff142abb940000142abb9bffff142abb640000142abb6bffff142abb540000142abb5bffff142aba4c0000142aba57ffff142aba380000142aba43ffff142a95f80000142aba07ffff142a95bc0000142a95c7ffff142a95a80000142a95b3ffff142a94a40000142a94abffff142a94940000142a949bffff142a94640000142a946bffff142a94540000142a945bffff142a934c0000142a9357ffff142a93380000142a9343ffff142a90f80000142a9307ffff142a90bc0000142a90c7ffff142a90a80000142a90b3ffff142a8fa40000142a8fabffff142a8f940000142a8f9bffff142a8f640000142a8f6bffff142a8f540000142a8f5bffff142a8e4c0000142a8e57ffff142a8e380000142a8e43ffff142a85f80000142a8e07ffff142a85bc0000142a85c7ffff142a85a80000142a85b3ffff142a84a40000142a84abffff142a84940000142a849bffff142a84640000142a846bffff142a84540000142a845bffff142a834c0000142a8357ffff142a83380000142a8343ffff142a81f80000142a8307ffff142a81bc0000142a81c7ffff142a81a80000142a81b3ffff142a80a80000142a80abffff150355fd500000150355fd5fffff150355f9300000150355f95fffff150355f8e00000150355f90fffff150355f
3e00000150355f81fffff150355f2f00000150355f31fffff150355f2a00000150355f2cfffff150355ee900000150355eeafffff150355ee500000150355ee6fffff150355ed900000150355edafffff150355ed500000150355ed6fffff150355e9300000150355e95fffff150355e8e00000150355e90fffff150355c7e00000150355e81fffff150355c6f00000150355c71fffff150355c6a00000150355c6cfffff150355c2900000150355c2afffff150355c2500000150355c26fffff150355c1900000150355c1afffff150355c1500000150355c16fffff150355bd300000150355bd5fffff150355bce00000150355bd0fffff150355b3e00000150355bc1fffff150355b2f00000150355b31fffff150355b2a00000150355b2cfffff150355ae900000150355aeafffff150355ae500000150355ae6fffff150355ad900000150355adafffff150355ad500000150355ad6fffff150355a9300000150355a95fffff150355a8e00000150355a90fffff15035517e00000150355a81fffff15035516f00000150355171fffff15035516a0000015035516cfffff1503551290000015035512afffff15035512500000150355126fffff1503551190000015035511afffff15035511500000150355116fffff1503550d3000001503550d5fffff1503550ce000001503550d0fffff15035503e000001503550c1fffff15035502f00000150355031fffff15035502a0000015035502cfffff150200fe900000150200feafffff150200fe500000150200fe6fffff150200fd900000150200fdafffff150200fd500000150200fd6fffff150200f9300000150200f95fffff150200f8e00000150200f90fffff150200c7e00000150200f81fffff150200c6f00000150200c71fffff150200c6a00000150200c6cfffff150200c2900000150200c2afffff150200c2500000150200c26fffff150200c1900000150200c1afffff150200c1500000150200c16fffff150200bd300000150200bd5fffff150200bce00000150200bd0fffff150200b3e00000150200bc1fffff150200b2f00000150200b31fffff150200b2a00000150200b2cfffff150200ae900000150200aeafffff150200ae500000150200ae6fffff150200ad900000150200adafffff150200ad500000150200ad6fffff150200a7f00000150200a80fffff150200a7a00000150200a7cfffff150200a4300000150200a45fffff1502009bf00000150200a40fffff1502009ba000001502009bcfffff15020098300000150200985fffff15020097f00000150200980fffff15020097a0000015020097cfffff15020094300000150200945fffff1502006bf00000150200940fffff1502006ba0000015020
06bcfffff15020068300000150200685fffff15020067f00000150200680fffff15020067a0000015020067cfffff15020064300000150200645fffff1502005bf00000150200640fffff1502005ba000001502005bcfffff15020058300000150200585fffff15020057f00000150200580fffff15020057a0000015020057cfffff15020054300000150200545fffff1501ffabf00000150200540fffff1501ffaba000001501ffabcfffff1501ffa83000001501ffa85fffff1501ffa7f000001501ffa80fffff1501ffa7a000001501ffa7cfffff1501ffa43000001501ffa45fffff1501ff9bf000001501ffa40fffff1501ff9ba000001501ff9bcfffff1501ff983000001501ff985fffff1501ff97f000001501ff980fffff1501ff97a000001501ff97cfffff1501ff943000001501ff945fffff1501ff6bf000001501ff940fffff1501ff6ba000001501ff6bcfffff1501ff683000001501ff685fffff1501ff67f000001501ff680fffff1501ff67a000001501ff67cfffff1501ff643000001501ff645fffff1501ff5bf000001501ff640fffff1501ff5ba000001501ff5bcfffff1501ff583000001501ff585fffff1501ff57f000001501ff580fffff1501ff529000001501ff52afffff1501ff525000001501ff526fffff1501ff519000001501ff51afffff1501ff515000001501ff516fffff1501ff4d3000001501ff4d5fffff1501ff4ce000001501ff4d0fffff1501ff43e000001501ff4c1fffff1501ff42f000001501ff431fffff1501ff42a000001501ff42cfffff1501ff3e9000001501ff3eafffff1501ff3e5000001501ff3e6fffff1501ff3d9000001501ff3dafffff1501ff3d5000001501ff3d6fffff1501ff393000001501ff395fffff1501ff38e000001501ff390fffff1501ff07e000001501ff381fffff1501ff06f000001501ff071fffff1501ff06a000001501ff06cfffff1501ff029000001501ff02afffff1501ff025000001501ff026fffff1501ff019000001501ff01afffff1501ff015000001501ff016fffff1500aafd3000001500aafd5fffff1500aafce000001500aafd0fffff1500aaf3e000001500aafc1fffff1500aaf2f000001500aaf31fffff1500aaf2a000001500aaf2cfffff1500aaee9000001500aaeeafffff1500aaee5000001500aaee6fffff1500aaed9000001500aaedafffff1500aaed5000001500aaed6fffff1500aae93000001500aae95fffff1500aae8e000001500aae90fffff1500aa57e000001500aae81fffff1500aa56f000001500aa571fffff1500aa56a000001500aa56cfffff1500aa529000001500aa52afffff1500aa525000001500aa526fffff1500aa519000001500aa51afffff150
0aa515000001500aa516fffff1500aa4d3000001500aa4d5fffff1500aa4ce000001500aa4d0fffff1500aa43e000001500aa4c1fffff1500aa42f000001500aa431fffff1500aa42a000001500aa42cfffff1500aa3e9000001500aa3eafffff1500aa3e5000001500aa3e6fffff1500aa3d9000001500aa3dafffff1500aa3d5000001500aa3d6fffff1500aa393000001500aa395fffff1500aa38e000001500aa390fffff1500aa17e000001500aa381fffff1500aa16f000001500aa171fffff1500aa16a000001500aa16cfffff1500aa129000001500aa12afffff1500aa125000001500aa126fffff1500aa119000001500aa11afffff1500aa115000001500aa116fffff1500aa0d3000001500aa0d5fffff1500aa0ce000001500aa0d0fffff1500aa07e000001500aa0c1fffff1500aa06f000001500aa071fffff1500aa06a000001500aa06cfffff1500aa02a000001500aa02afffff160d57f5400000160d57f57fffff160d57e4c00000160d57e57fffff160d57e3800000160d57e43fffff160d57cf800000160d57e07fffff160d57cbc00000160d57cc7fffff160d57ca800000160d57cb3fffff160d57ba400000160d57babfffff160d57b9400000160d57b9bfffff160d57b6400000160d57b6bfffff160d57b5400000160d57b5bfffff160d57a4c00000160d57a57fffff160d57a3800000160d57a43fffff160d571f800000160d57a07fffff160d571bc00000160d571c7fffff160d571a800000160d571b3fffff160d570a400000160d570abfffff160d5709400000160d5709bfffff160d5706400000160d5706bfffff160d5705400000160d5705bfffff160d56f4c00000160d56f57fffff160d56f3800000160d56f43fffff160d56cf800000160d56f07fffff160d56cbc00000160d56cc7fffff160d56ca800000160d56cb3fffff160d56ba400000160d56babfffff160d56b9400000160d56b9bfffff160d56b6400000160d56b6bfffff160d56b5400000160d56b5bfffff160d56a4c00000160d56a57fffff160d56a3800000160d56a43fffff160d545f800000160d56a07fffff160d545bc00000160d545c7fffff160d545a800000160d545b3fffff160d544a400000160d544abfffff160d5449400000160d5449bfffff160d5446400000160d5446bfffff160d5445400000160d5445bfffff160d5434c00000160d54357fffff160d5433800000160d54343fffff160d540f800000160d54307fffff160d540bc00000160d540c7fffff160d540a800000160d540b3fffff160803fa400000160803fabfffff160803f9400000160803f9bfffff160803f6400000160803f6bfffff160803f5400000160803f5bfffff160803e4c000001
60803e57fffff160803e3800000160803e43fffff1608031f800000160803e07fffff1608031bc000001608031c7fffff1608031a8000001608031b3fffff1608030a4000001608030abfffff1608030940000016080309bfffff1608030640000016080306bfffff1608030540000016080305bfffff160802f4c00000160802f57fffff160802f3800000160802f43fffff160802cf800000160802f07fffff160802cbc00000160802cc7fffff160802ca800000160802cb3fffff160802ba400000160802babfffff160802b9400000160802b9bfffff160802b6400000160802b6bfffff160802b5400000160802b5bfffff1608029fc00000160802a03fffff1608029e8000001608029f3fffff16080290c00000160802917fffff1608026fc00000160802903fffff1608026e8000001608026f3fffff16080260c00000160802617fffff1608025fc00000160802603fffff1608025e8000001608025f3fffff16080250c00000160802517fffff160801afc00000160802503fffff160801ae800000160801af3fffff160801a0c00000160801a17fffff1608019fc00000160801a03fffff1608019e8000001608019f3fffff16080190c00000160801917fffff1608016fc00000160801903fffff1608016e8000001608016f3fffff16080160c00000160801617fffff1608015fc00000160801603fffff1608015e8000001608015f3fffff16080150c00000160801517fffff1607feafc00000160801503fffff1607feae8000001607feaf3fffff1607fea0c000001607fea17fffff1607fe9fc000001607fea03fffff1607fe9e8000001607fe9f3fffff1607fe90c000001607fe917fffff1607fe6fc000001607fe903fffff1607fe6e8000001607fe6f3fffff1607fe60c000001607fe617fffff1607fe5fc000001607fe603fffff1607fe5e8000001607fe5f3fffff1607fe50c000001607fe517fffff1607fdafc000001607fe503fffff1607fdae8000001607fdaf3fffff1607fda0c000001607fda17fffff1607fd9fc000001607fda03fffff1607fd9e8000001607fd9f3fffff1607fd90c000001607fd917fffff1607fd6fc000001607fd903fffff1607fd6e8000001607fd6f3fffff1607fd60c000001607fd617fffff1607fd5fc000001607fd603fffff1607fd4a4000001607fd4abfffff1607fd494000001607fd49bfffff1607fd464000001607fd46bfffff1607fd454000001607fd45bfffff1607fd34c000001607fd357fffff1607fd338000001607fd343fffff1607fd0f8000001607fd307fffff1607fd0bc000001607fd0c7fffff1607fd0a8000001607fd0b3fffff1607fcfa4000001607fcfabfffff1607fcf94000001607fcf9bffff
f1607fcf64000001607fcf6bfffff1607fcf54000001607fcf5bfffff1607fce4c000001607fce57fffff1607fce38000001607fce43fffff1607fc1f8000001607fce07fffff1607fc1bc000001607fc1c7fffff1607fc1a8000001607fc1b3fffff1607fc0a4000001607fc0abfffff1607fc094000001607fc09bfffff1607fc064000001607fc06bfffff1607fc054000001607fc05bfffff1602abf4c000001602abf57fffff1602abf38000001602abf43fffff1602abcf8000001602abf07fffff1602abcbc000001602abcc7fffff1602abca8000001602abcb3fffff1602abba4000001602abbabfffff1602abb94000001602abb9bfffff1602abb64000001602abb6bfffff1602abb54000001602abb5bfffff1602aba4c000001602aba57fffff1602aba38000001602aba43fffff1602a95f8000001602aba07fffff1602a95bc000001602a95c7fffff1602a95a8000001602a95b3fffff1602a94a4000001602a94abfffff1602a9494000001602a949bfffff1602a9464000001602a946bfffff1602a9454000001602a945bfffff1602a934c000001602a9357fffff1602a9338000001602a9343fffff1602a90f8000001602a9307fffff1602a90bc000001602a90c7fffff1602a90a8000001602a90b3fffff1602a8fa4000001602a8fabfffff1602a8f94000001602a8f9bfffff1602a8f64000001602a8f6bfffff1602a8f54000001602a8f5bfffff1602a8e4c000001602a8e57fffff1602a8e38000001602a8e43fffff1602a85f8000001602a8e07fffff1602a85bc000001602a85c7fffff1602a85a8000001602a85b3fffff1602a84a4000001602a84abfffff1602a8494000001602a849bfffff1602a8464000001602a846bfffff1602a8454000001602a845bfffff1602a834c000001602a8357fffff1602a8338000001602a8343fffff1602a81f8000001602a8307fffff1602a81bc000001602a81c7fffff1602a81a8000001602a81b3fffff1602a80a8000001602a80abfffff17355fd500000017355fd5ffffff17355f9300000017355f95ffffff17355f8e00000017355f90ffffff17355f3e00000017355f81ffffff17355f2f00000017355f31ffffff17355f2a00000017355f2cffffff17355ee900000017355eeaffffff17355ee500000017355ee6ffffff17355ed900000017355edaffffff17355ed500000017355ed6ffffff17355e9300000017355e95ffffff17355e8e00000017355e90ffffff17355c7e00000017355e81ffffff17355c6f00000017355c71ffffff17355c6a00000017355c6cffffff17355c2900000017355c2affffff17355c2500000017355c26ffffff17355c1900000017355c1affffff17355c15000
00017355c16ffffff17355bd300000017355bd5ffffff17355bce00000017355bd0ffffff17355b3e00000017355bc1ffffff17355b2f00000017355b31ffffff17355b2a00000017355b2cffffff17355ae900000017355aeaffffff17355ae500000017355ae6ffffff17355ad900000017355adaffffff17355ad500000017355ad6ffffff17355a9300000017355a95ffffff17355a8e00000017355a90ffffff1735517e00000017355a81ffffff1735516f00000017355171ffffff1735516a0000001735516cffffff173551290000001735512affffff1735512500000017355126ffffff173551190000001735511affffff1735511500000017355116ffffff173550d3000000173550d5ffffff173550ce000000173550d0ffffff1735503e000000173550c1ffffff1735502f00000017355031ffffff1735502a0000001735502cffffff17200fe900000017200feaffffff17200fe500000017200fe6ffffff17200fd900000017200fdaffffff17200fd500000017200fd6ffffff17200f9300000017200f95ffffff17200f8e00000017200f90ffffff17200c7e00000017200f81ffffff17200c6f00000017200c71ffffff17200c6a00000017200c6cffffff17200c2900000017200c2affffff17200c2500000017200c26ffffff17200c1900000017200c1affffff17200c1500000017200c16ffffff17200bd300000017200bd5ffffff17200bce00000017200bd0ffffff17200b3e00000017200bc1ffffff17200b2f00000017200b31ffffff17200b2a00000017200b2cffffff17200ae900000017200aeaffffff17200ae500000017200ae6ffffff17200ad900000017200adaffffff17200ad500000017200ad6ffffff17200a7f00000017200a80ffffff17200a7a00000017200a7cffffff17200a4300000017200a45ffffff172009bf00000017200a40ffffff172009ba000000172009bcffffff1720098300000017200985ffffff1720097f00000017200980ffffff1720097a0000001720097cffffff1720094300000017200945ffffff172006bf00000017200940ffffff172006ba000000172006bcffffff1720068300000017200685ffffff1720067f00000017200680ffffff1720067a0000001720067cffffff1720064300000017200645ffffff172005bf00000017200640ffffff172005ba000000172005bcffffff1720058300000017200585ffffff1720057f00000017200580ffffff1720057a0000001720057cffffff1720054300000017200545ffffff171ffabf00000017200540ffffff171ffaba000000171ffabcffffff171ffa83000000171ffa85ffffff171ffa7f000000171ffa80ffffff171ffa7a000000171ffa7cf
fffff171ffa43000000171ffa45ffffff171ff9bf000000171ffa40ffffff171ff9ba000000171ff9bcffffff171ff983000000171ff985ffffff171ff97f000000171ff980ffffff171ff97a000000171ff97cffffff171ff943000000171ff945ffffff171ff6bf000000171ff940ffffff171ff6ba000000171ff6bcffffff171ff683000000171ff685ffffff171ff67f000000171ff680ffffff171ff67a000000171ff67cffffff171ff643000000171ff645ffffff171ff5bf000000171ff640ffffff171ff5ba000000171ff5bcffffff171ff583000000171ff585ffffff171ff57f000000171ff580ffffff171ff529000000171ff52affffff171ff525000000171ff526ffffff171ff519000000171ff51affffff171ff515000000171ff516ffffff171ff4d3000000171ff4d5ffffff171ff4ce000000171ff4d0ffffff171ff43e000000171ff4c1ffffff171ff42f000000171ff431ffffff171ff42a000000171ff42cffffff171ff3e9000000171ff3eaffffff171ff3e5000000171ff3e6ffffff171ff3d9000000171ff3daffffff171ff3d5000000171ff3d6ffffff171ff393000000171ff395ffffff171ff38e000000171ff390ffffff171ff07e000000171ff381ffffff171ff06f000000171ff071ffffff171ff06a000000171ff06cffffff171ff029000000171ff02affffff171ff025000000171ff026ffffff171ff019000000171ff01affffff171ff015000000171ff016ffffff170aafd3000000170aafd5ffffff170aafce000000170aafd0ffffff170aaf3e000000170aafc1ffffff170aaf2f000000170aaf31ffffff170aaf2a000000170aaf2cffffff170aaee9000000170aaeeaffffff170aaee5000000170aaee6ffffff170aaed9000000170aaedaffffff170aaed5000000170aaed6ffffff170aae93000000170aae95ffffff170aae8e000000170aae90ffffff170aa57e000000170aae81ffffff170aa56f000000170aa571ffffff170aa56a000000170aa56cffffff170aa529000000170aa52affffff170aa525000000170aa526ffffff170aa519000000170aa51affffff170aa515000000170aa516ffffff170aa4d3000000170aa4d5ffffff170aa4ce000000170aa4d0ffffff170aa43e000000170aa4c1ffffff170aa42f000000170aa431ffffff170aa42a000000170aa42cffffff170aa3e9000000170aa3eaffffff170aa3e5000000170aa3e6ffffff170aa3d9000000170aa3daffffff170aa3d5000000170aa3d6ffffff170aa393000000170aa395ffffff170aa38e000000170aa390ffffff170aa17e000000170aa381ffffff170aa16f000000170aa171ffffff170aa16a000000170aa16cffffff170aa12
9000000170aa12affffff170aa125000000170aa126ffffff170aa119000000170aa11affffff170aa115000000170aa116ffffff170aa0d3000000170aa0d5ffffff170aa0ce000000170aa0d0ffffff170aa07e000000170aa0c1ffffff170aa06f000000170aa071ffffff170aa06a000000170aa06cffffff170aa02a000000170aa02affffff18d57f5400000018d57f57ffffff18d57e4c00000018d57e57ffffff18d57e3800000018d57e43ffffff18d57cf800000018d57e07ffffff18d57cbc00000018d57cc7ffffff18d57ca800000018d57cb3ffffff18d57ba400000018d57babffffff18d57b9400000018d57b9bffffff18d57b6400000018d57b6bffffff18d57b5400000018d57b5bffffff18d57a4c00000018d57a57ffffff18d57a3800000018d57a43ffffff18d571f800000018d57a07ffffff18d571bc00000018d571c7ffffff18d571a800000018d571b3ffffff18d570a400000018d570abffffff18d5709400000018d5709bffffff18d5706400000018d5706bffffff18d5705400000018d5705bffffff18d56f4c00000018d56f57ffffff18d56f3800000018d56f43ffffff18d56cf800000018d56f07ffffff18d56cbc00000018d56cc7ffffff18d56ca800000018d56cb3ffffff18d56ba400000018d56babffffff18d56b9400000018d56b9bffffff18d56b6400000018d56b6bffffff18d56b5400000018d56b5bffffff18d56a4c00000018d56a57ffffff18d56a3800000018d56a43ffffff18d545f800000018d56a07ffffff18d545bc00000018d545c7ffffff18d545a800000018d545b3ffffff18d544a400000018d544abffffff18d5449400000018d5449bffffff18d5446400000018d5446bffffff18d5445400000018d5445bffffff18d5434c00000018d54357ffffff18d5433800000018d54343ffffff18d540f800000018d54307ffffff18d540bc00000018d540c7ffffff18d540a800000018d540b3ffffff18803fa400000018803fabffffff18803f9400000018803f9bffffff18803f6400000018803f6bffffff18803f5400000018803f5bffffff18803e4c00000018803e57ffffff18803e3800000018803e43ffffff188031f800000018803e07ffffff188031bc000000188031c7ffffff188031a8000000188031b3ffffff188030a4000000188030abffffff188030940000001880309bffffff188030640000001880306bffffff188030540000001880305bffffff18802f4c00000018802f57ffffff18802f3800000018802f43ffffff18802cf800000018802f07ffffff18802cbc00000018802cc7ffffff18802ca800000018802cb3ffffff18802ba400000018802babffffff18802b9400000018802
b9bffffff18802b6400000018802b6bffffff18802b5400000018802b5bffffff188029fc00000018802a03ffffff188029e8000000188029f3ffffff1880290c00000018802917ffffff188026fc00000018802903ffffff188026e8000000188026f3ffffff1880260c00000018802617ffffff188025fc00000018802603ffffff188025e8000000188025f3ffffff1880250c00000018802517ffffff18801afc00000018802503ffffff18801ae800000018801af3ffffff18801a0c00000018801a17ffffff188019fc00000018801a03ffffff188019e8000000188019f3ffffff1880190c00000018801917ffffff188016fc00000018801903ffffff188016e8000000188016f3ffffff1880160c00000018801617ffffff188015fc00000018801603ffffff188015e8000000188015f3ffffff1880150c00000018801517ffffff187feafc00000018801503ffffff187feae8000000187feaf3ffffff187fea0c000000187fea17ffffff187fe9fc000000187fea03ffffff187fe9e8000000187fe9f3ffffff187fe90c000000187fe917ffffff187fe6fc000000187fe903ffffff187fe6e8000000187fe6f3ffffff187fe60c000000187fe617ffffff187fe5fc000000187fe603ffffff187fe5e8000000187fe5f3ffffff187fe50c000000187fe517ffffff187fdafc000000187fe503ffffff187fdae8000000187fdaf3ffffff187fda0c000000187fda17ffffff187fd9fc000000187fda03ffffff187fd9e8000000187fd9f3ffffff187fd90c000000187fd917ffffff187fd6fc000000187fd903ffffff187fd6e8000000187fd6f3ffffff187fd60c000000187fd617ffffff187fd5fc000000187fd603ffffff187fd4a4000000187fd4abffffff187fd494000000187fd49bffffff187fd464000000187fd46bffffff187fd454000000187fd45bffffff187fd34c000000187fd357ffffff187fd338000000187fd343ffffff187fd0f8000000187fd307ffffff187fd0bc000000187fd0c7ffffff187fd0a8000000187fd0b3ffffff187fcfa4000000187fcfabffffff187fcf94000000187fcf9bffffff187fcf64000000187fcf6bffffff187fcf54000000187fcf5bffffff187fce4c000000187fce57ffffff187fce38000000187fce43ffffff187fc1f8000000187fce07ffffff187fc1bc000000187fc1c7ffffff187fc1a8000000187fc1b3ffffff187fc0a4000000187fc0abffffff187fc094000000187fc09bffffff187fc064000000187fc06bffffff187fc054000000187fc05bffffff182abf4c000000182abf57ffffff182abf38000000182abf43ffffff182abcf8000000182abf07ffffff182abcbc000000182abcc7ffffff182
abca8000000182abcb3ffffff182abba4000000182abbabffffff182abb94000000182abb9bffffff182abb64000000182abb6bffffff182abb54000000182abb5bffffff182aba4c000000182aba57ffffff182aba38000000182aba43ffffff182a95f8000000182aba07ffffff182a95bc000000182a95c7ffffff182a95a8000000182a95b3ffffff182a94a4000000182a94abffffff182a9494000000182a949bffffff182a9464000000182a946bffffff182a9454000000182a945bffffff182a934c000000182a9357ffffff182a9338000000182a9343ffffff182a90f8000000182a9307ffffff182a90bc000000182a90c7ffffff182a90a8000000182a90b3ffffff182a8fa4000000182a8fabffffff182a8f94000000182a8f9bffffff182a8f64000000182a8f6bffffff182a8f54000000182a8f5bffffff182a8e4c000000182a8e57ffffff182a8e38000000182a8e43ffffff182a85f8000000182a8e07ffffff182a85bc000000182a85c7ffffff182a85a8000000182a85b3ffffff182a84a4000000182a84abffffff182a8494000000182a849bffffff182a8464000000182a846bffffff182a8454000000182a845bffffff182a834c000000182a8357ffffff182a8338000000182a8343ffffff182a81f8000000182a8307ffffff182a81bc000000182a81c7ffffff182a81a8000000182a81b3ffffff182a80a8000000182a80abffffff190355fd50000000190355fd5fffffff190355f930000000190355f95fffffff190355f8e0000000190355f90fffffff190355f3e0000000190355f81fffffff190355f2f0000000190355f31fffffff190355f2a0000000190355f2cfffffff190355ee90000000190355eeafffffff190355ee50000000190355ee6fffffff190355ed90000000190355edafffffff190355ed50000000190355ed6fffffff190355e930000000190355e95fffffff190355e8e0000000190355e90fffffff190355c7e0000000190355e81fffffff190355c6f0000000190355c71fffffff190355c6a0000000190355c6cfffffff190355c290000000190355c2afffffff190355c250000000190355c26fffffff190355c190000000190355c1afffffff190355c150000000190355c16fffffff190355bd30000000190355bd5fffffff190355bce0000000190355bd0fffffff190355b3e0000000190355bc1fffffff190355b2f0000000190355b31fffffff190355b2a0000000190355b2cfffffff190355ae90000000190355aeafffffff190355ae50000000190355ae6fffffff190355ad90000000190355adafffffff190355ad50000000190355ad6fffffff190355a930000000190355a95fffffff190355a8e00
00000190355a90fffffff19035517e0000000190355a81fffffff19035516f0000000190355171fffffff19035516a000000019035516cfffffff190355129000000019035512afffffff1903551250000000190355126fffffff190355119000000019035511afffffff1903551150000000190355116fffffff1903550d300000001903550d5fffffff1903550ce00000001903550d0fffffff19035503e00000001903550c1fffffff19035502f0000000190355031fffffff19035502a000000019035502cfffffff190200fe90000000190200feafffffff190200fe50000000190200fe6fffffff190200fd90000000190200fdafffffff190200fd50000000190200fd6fffffff190200f930000000190200f95fffffff190200f8e0000000190200f90fffffff190200c7e0000000190200f81fffffff190200c6f0000000190200c71fffffff190200c6a0000000190200c6cfffffff190200c290000000190200c2afffffff190200c250000000190200c26fffffff190200c190000000190200c1afffffff190200c150000000190200c16fffffff190200bd30000000190200bd5fffffff190200bce0000000190200bd0fffffff190200b3e0000000190200bc1fffffff190200b2f0000000190200b31fffffff190200b2a0000000190200b2cfffffff190200ae90000000190200aeafffffff190200ae50000000190200ae6fffffff190200ad90000000190200adafffffff190200ad50000000190200ad6fffffff190200a7f0000000190200a80fffffff190200a7a0000000190200a7cfffffff190200a430000000190200a45fffffff1902009bf0000000190200a40fffffff1902009ba00000001902009bcfffffff1902009830000000190200985fffffff19020097f0000000190200980fffffff19020097a000000019020097cfffffff1902009430000000190200945fffffff1902006bf0000000190200940fffffff1902006ba00000001902006bcfffffff1902006830000000190200685fffffff19020067f0000000190200680fffffff19020067a000000019020067cfffffff1902006430000000190200645fffffff1902005bf0000000190200640fffffff1902005ba00000001902005bcfffffff1902005830000000190200585fffffff19020057f0000000190200580fffffff19020057a000000019020057cfffffff1902005430000000190200545fffffff1901ffabf0000000190200540fffffff1901ffaba00000001901ffabcfffffff1901ffa8300000001901ffa85fffffff1901ffa7f00000001901ffa80fffffff1901ffa7a00000001901ffa7cfffffff1901ffa4300000001901ffa45fffffff1901ff9bf00000001901ffa40ff
fffff1901ff9ba00000001901ff9bcfffffff1901ff98300000001901ff985fffffff1901ff97f00000001901ff980fffffff1901ff97a00000001901ff97cfffffff1901ff94300000001901ff945fffffff1901ff6bf00000001901ff940fffffff1901ff6ba00000001901ff6bcfffffff1901ff68300000001901ff685fffffff1901ff67f00000001901ff680fffffff1901ff67a00000001901ff67cfffffff1901ff64300000001901ff645fffffff1901ff5bf00000001901ff640fffffff1901ff5ba00000001901ff5bcfffffff1901ff58300000001901ff585fffffff1901ff57f00000001901ff580fffffff1901ff52900000001901ff52afffffff1901ff52500000001901ff526fffffff1901ff51900000001901ff51afffffff1901ff51500000001901ff516fffffff1901ff4d300000001901ff4d5fffffff1901ff4ce00000001901ff4d0fffffff1901ff43e00000001901ff4c1fffffff1901ff42f00000001901ff431fffffff1901ff42a00000001901ff42cfffffff1901ff3e900000001901ff3eafffffff1901ff3e500000001901ff3e6fffffff1901ff3d900000001901ff3dafffffff1901ff3d500000001901ff3d6fffffff1901ff39300000001901ff395fffffff1901ff38e00000001901ff390fffffff1901ff07e00000001901ff381fffffff1901ff06f00000001901ff071fffffff1901ff06a00000001901ff06cfffffff1901ff02900000001901ff02afffffff1901ff02500000001901ff026fffffff1901ff01900000001901ff01afffffff1901ff01500000001901ff016fffffff1900aafd300000001900aafd5fffffff1900aafce00000001900aafd0fffffff1900aaf3e00000001900aafc1fffffff1900aaf2f00000001900aaf31fffffff1900aaf2a00000001900aaf2cfffffff1900aaee900000001900aaeeafffffff1900aaee500000001900aaee6fffffff1900aaed900000001900aaedafffffff1900aaed500000001900aaed6fffffff1900aae9300000001900aae95fffffff1900aae8e00000001900aae90fffffff1900aa57e00000001900aae81fffffff1900aa56f00000001900aa571fffffff1900aa56a00000001900aa56cfffffff1900aa52900000001900aa52afffffff1900aa52500000001900aa526fffffff1900aa51900000001900aa51afffffff1900aa51500000001900aa516fffffff1900aa4d300000001900aa4d5fffffff1900aa4ce00000001900aa4d0fffffff1900aa43e00000001900aa4c1fffffff1900aa42f00000001900aa431fffffff1900aa42a00000001900aa42cfffffff1900aa3e900000001900aa3eafffffff1900aa3e500000001900aa3e6fffffff1900aa3d900
000001900aa3dafffffff1900aa3d500000001900aa3d6fffffff1900aa39300000001900aa395fffffff1900aa38e00000001900aa390fffffff1900aa17e00000001900aa381fffffff1900aa16f00000001900aa171fffffff1900aa16a00000001900aa16cfffffff1900aa12900000001900aa12afffffff1900aa12500000001900aa126fffffff1900aa11900000001900aa11afffffff1900aa11500000001900aa116fffffff1900aa0d300000001900aa0d5fffffff1900aa0ce00000001900aa0d0fffffff1900aa07e00000001900aa0c1fffffff1900aa06f00000001900aa071fffffff1900aa06a00000001900aa06cfffffff1900aa02a00000001900aa02afffffff1a0d57f5400000001a0d57f57fffffff1a0d57e4c00000001a0d57e57fffffff1a0d57e3800000001a0d57e43fffffff1a0d57cf800000001a0d57e07fffffff1a0d57cbc00000001a0d57cc7fffffff1a0d57ca800000001a0d57cb3fffffff1a0d57ba400000001a0d57babfffffff1a0d57b9400000001a0d57b9bfffffff1a0d57b6400000001a0d57b6bfffffff1a0d57b5400000001a0d57b5bfffffff1a0d57a4c00000001a0d57a57fffffff1a0d57a3800000001a0d57a43fffffff1a0d571f800000001a0d57a07fffffff1a0d571bc00000001a0d571c7fffffff1a0d571a800000001a0d571b3fffffff1a0d570a400000001a0d570abfffffff1a0d5709400000001a0d5709bfffffff1a0d5706400000001a0d5706bfffffff1a0d5705400000001a0d5705bfffffff1a0d56f4c00000001a0d56f57fffffff1a0d56f3800000001a0d56f43fffffff1a0d56cf800000001a0d56f07fffffff1a0d56cbc00000001a0d56cc7fffffff1a0d56ca800000001a0d56cb3fffffff1a0d56ba400000001a0d56babfffffff1a0d56b9400000001a0d56b9bfffffff1a0d56b6400000001a0d56b6bfffffff1a0d56b5400000001a0d56b5bfffffff1a0d56a4c00000001a0d56a57fffffff1a0d56a3800000001a0d56a43fffffff1a0d545f800000001a0d56a07fffffff1a0d545bc00000001a0d545c7fffffff1a0d545a800000001a0d545b3fffffff1a0d544a400000001a0d544abfffffff1a0d5449400000001a0d5449bfffffff1a0d5446400000001a0d5446bfffffff1a0d5445400000001a0d5445bfffffff1a0d5434c00000001a0d54357fffffff1a0d5433800000001a0d54343fffffff1a0d540f800000001a0d54307fffffff1a0d540bc00000001a0d540c7fffffff1a0d540a800000001a0d540b3fffffff1a0803fa400000001a0803fabfffffff1a0803f9400000001a0803f9bfffffff1a0803f6400000001a0803f6bfffffff1a0803f5400000001a0803f5bff
fffff1a0803e4c00000001a0803e57fffffff1a0803e3800000001a0803e43fffffff1a08031f800000001a0803e07fffffff1a08031bc00000001a08031c7fffffff1a08031a800000001a08031b3fffffff1a08030a400000001a08030abfffffff1a080309400000001a080309bfffffff1a080306400000001a080306bfffffff1a080305400000001a080305bfffffff1a0802f4c00000001a0802f57fffffff1a0802f3800000001a0802f43fffffff1a0802cf800000001a0802f07fffffff1a0802cbc00000001a0802cc7fffffff1a0802ca800000001a0802cb3fffffff1a0802ba400000001a0802babfffffff1a0802b9400000001a0802b9bfffffff1a0802b6400000001a0802b6bfffffff1a0802b5400000001a0802b5bfffffff1a08029fc00000001a0802a03fffffff1a08029e800000001a08029f3fffffff1a080290c00000001a0802917fffffff1a08026fc00000001a0802903fffffff1a08026e800000001a08026f3fffffff1a080260c00000001a0802617fffffff1a08025fc00000001a0802603fffffff1a08025e800000001a08025f3fffffff1a080250c00000001a0802517fffffff1a0801afc00000001a0802503fffffff1a0801ae800000001a0801af3fffffff1a0801a0c00000001a0801a17fffffff1a08019fc00000001a0801a03fffffff1a08019e800000001a08019f3fffffff1a080190c00000001a0801917fffffff1a08016fc00000001a0801903fffffff1a08016e800000001a08016f3fffffff1a080160c00000001a0801617fffffff1a08015fc00000001a0801603fffffff1a08015e800000001a08015f3fffffff1a080150c00000001a0801517fffffff1a07feafc00000001a0801503fffffff1a07feae800000001a07feaf3fffffff1a07fea0c00000001a07fea17fffffff1a07fe9fc00000001a07fea03fffffff1a07fe9e800000001a07fe9f3fffffff1a07fe90c00000001a07fe917fffffff1a07fe6fc00000001a07fe903fffffff1a07fe6e800000001a07fe6f3fffffff1a07fe60c00000001a07fe617fffffff1a07fe5fc00000001a07fe603fffffff1a07fe5e800000001a07fe5f3fffffff1a07fe50c00000001a07fe517fffffff1a07fdafc00000001a07fe503fffffff1a07fdae800000001a07fdaf3fffffff1a07fda0c00000001a07fda17fffffff1a07fd9fc00000001a07fda03fffffff1a07fd9e800000001a07fd9f3fffffff1a07fd90c00000001a07fd917fffffff1a07fd6fc00000001a07fd903fffffff1a07fd6e800000001a07fd6f3fffffff1a07fd60c00000001a07fd617fffffff1a07fd5fc00000001a07fd603fffffff1a07fd4a400000001a07fd4abfffffff1a07fd49400
000001a07fd49bfffffff1a07fd46400000001a07fd46bfffffff1a07fd45400000001a07fd45bfffffff1a07fd34c00000001a07fd357fffffff1a07fd33800000001a07fd343fffffff1a07fd0f800000001a07fd307fffffff1a07fd0bc00000001a07fd0c7fffffff1a07fd0a800000001a07fd0b3fffffff1a07fcfa400000001a07fcfabfffffff1a07fcf9400000001a07fcf9bfffffff1a07fcf6400000001a07fcf6bfffffff1a07fcf5400000001a07fcf5bfffffff1a07fce4c00000001a07fce57fffffff1a07fce3800000001a07fce43fffffff1a07fc1f800000001a07fce07fffffff1a07fc1bc00000001a07fc1c7fffffff1a07fc1a800000001a07fc1b3fffffff1a07fc0a400000001a07fc0abfffffff1a07fc09400000001a07fc09bfffffff1a07fc06400000001a07fc06bfffffff1a07fc05400000001a07fc05bfffffff1a02abf4c00000001a02abf57fffffff1a02abf3800000001a02abf43fffffff1a02abcf800000001a02abf07fffffff1a02abcbc00000001a02abcc7fffffff1a02abca800000001a02abcb3fffffff1a02abba400000001a02abbabfffffff1a02abb9400000001a02abb9bfffffff1a02abb6400000001a02abb6bfffffff1a02abb5400000001a02abb5bfffffff1a02aba4c00000001a02aba57fffffff1a02aba3800000001a02aba43fffffff1a02a95f800000001a02aba07fffffff1a02a95bc00000001a02a95c7fffffff1a02a95a800000001a02a95b3fffffff1a02a94a400000001a02a94abfffffff1a02a949400000001a02a949bfffffff1a02a946400000001a02a946bfffffff1a02a945400000001a02a945bfffffff1a02a934c00000001a02a9357fffffff1a02a933800000001a02a9343fffffff1a02a90f800000001a02a9307fffffff1a02a90bc00000001a02a90c7fffffff1a02a90a800000001a02a90b3fffffff1a02a8fa400000001a02a8fabfffffff1a02a8f9400000001a02a8f9bfffffff1a02a8f6400000001a02a8f6bfffffff1a02a8f5400000001a02a8f5bfffffff1a02a8e4c00000001a02a8e57fffffff1a02a8e3800000001a02a8e43fffffff1a02a85f800000001a02a8e07fffffff1a02a85bc00000001a02a85c7fffffff1a02a85a800000001a02a85b3fffffff1a02a84a400000001a02a84abfffffff1a02a849400000001a02a849bfffffff1a02a846400000001a02a846bfffffff1a02a845400000001a02a845bfffffff1a02a834c00000001a02a8357fffffff1a02a833800000001a02a8343fffffff1a02a81f800000001a02a8307fffffff1a02a81bc00000001a02a81c7fffffff1a02a81a800000001a02a81b3fffffff1a02a80a800000001a02a80abff
fffff1b355fd5000000001b355fd5ffffffff1b355f93000000001b355f95ffffffff1b355f8e000000001b355f90ffffffff1b355f3e000000001b355f81ffffffff1b355f2f000000001b355f31ffffffff1b355f2a000000001b355f2cffffffff1b355ee9000000001b355eeaffffffff1b355ee5000000001b355ee6ffffffff1b355ed9000000001b355edaffffffff1b355ed5000000001b355ed6ffffffff1b355e93000000001b355e95ffffffff1b355e8e000000001b355e90ffffffff1b355c7e000000001b355e81ffffffff1b355c6f000000001b355c71ffffffff1b355c6a000000001b355c6cffffffff1b355c29000000001b355c2affffffff1b355c25000000001b355c26ffffffff1b355c19000000001b355c1affffffff1b355c15000000001b355c16ffffffff1b355bd3000000001b355bd5ffffffff1b355bce000000001b355bd0ffffffff1b355b3e000000001b355bc1ffffffff1b355b2f000000001b355b31ffffffff1b355b2a000000001b355b2cffffffff1b355ae9000000001b355aeaffffffff1b355ae5000000001b355ae6ffffffff1b355ad9000000001b355adaffffffff1b355ad5000000001b355ad6ffffffff1b355a93000000001b355a95ffffffff1b355a8e000000001b355a90ffffffff1b35517e000000001b355a81ffffffff1b35516f000000001b355171ffffffff1b35516a000000001b35516cffffffff1b355129000000001b35512affffffff1b355125000000001b355126ffffffff1b355119000000001b35511affffffff1b355115000000001b355116ffffffff1b3550d3000000001b3550d5ffffffff1b3550ce000000001b3550d0ffffffff1b35503e000000001b3550c1ffffffff1b35502f000000001b355031ffffffff1b35502a000000001b35502cffffffff1b200fe9000000001b200feaffffffff1b200fe5000000001b200fe6ffffffff1b200fd9000000001b200fdaffffffff1b200fd5000000001b200fd6ffffffff1b200f93000000001b200f95ffffffff1b200f8e000000001b200f90ffffffff1b200c7e000000001b200f81ffffffff1b200c6f000000001b200c71ffffffff1b200c6a000000001b200c6cffffffff1b200c29000000001b200c2affffffff1b200c25000000001b200c26ffffffff1b200c19000000001b200c1affffffff1b200c15000000001b200c16ffffffff1b200bd3000000001b200bd5ffffffff1b200bce000000001b200bd0ffffffff1b200b3e000000001b200bc1ffffffff1b200b2f000000001b200b31ffffffff1b200b2a000000001b200b2cffffffff1b200ae9000000001b200aeaffffffff1b200ae5000000001b200ae6ffffffff1b200ad9000
000001b200adaffffffff1b200ad5000000001b200ad6ffffffff1b200a7f000000001b200a80ffffffff1b200a7a000000001b200a7cffffffff1b200a43000000001b200a45ffffffff1b2009bf000000001b200a40ffffffff1b2009ba000000001b2009bcffffffff1b200983000000001b200985ffffffff1b20097f000000001b200980ffffffff1b20097a000000001b20097cffffffff1b200943000000001b200945ffffffff1b2006bf000000001b200940ffffffff1b2006ba000000001b2006bcffffffff1b200683000000001b200685ffffffff1b20067f000000001b200680ffffffff1b20067a000000001b20067cffffffff1b200643000000001b200645ffffffff1b2005bf000000001b200640ffffffff1b2005ba000000001b2005bcffffffff1b200583000000001b200585ffffffff1b20057f000000001b200580ffffffff1b20057a000000001b20057cffffffff1b200543000000001b200545ffffffff1b1ffabf000000001b200540ffffffff1b1ffaba000000001b1ffabcffffffff1b1ffa83000000001b1ffa85ffffffff1b1ffa7f000000001b1ffa80ffffffff1b1ffa7a000000001b1ffa7cffffffff1b1ffa43000000001b1ffa45ffffffff1b1ff9bf000000001b1ffa40ffffffff1b1ff9ba000000001b1ff9bcffffffff1b1ff983000000001b1ff985ffffffff1b1ff97f000000001b1ff980ffffffff1b1ff97a000000001b1ff97cffffffff1b1ff943000000001b1ff945ffffffff1b1ff6bf000000001b1ff940ffffffff1b1ff6ba000000001b1ff6bcffffffff1b1ff683000000001b1ff685ffffffff1b1ff67f000000001b1ff680ffffffff1b1ff67a000000001b1ff67cffffffff1b1ff643000000001b1ff645ffffffff1b1ff5bf000000001b1ff640ffffffff1b1ff5ba000000001b1ff5bcffffffff1b1ff583000000001b1ff585ffffffff1b1ff57f000000001b1ff580ffffffff1b1ff529000000001b1ff52affffffff1b1ff525000000001b1ff526ffffffff1b1ff519000000001b1ff51affffffff1b1ff515000000001b1ff516ffffffff1b1ff4d3000000001b1ff4d5ffffffff1b1ff4ce000000001b1ff4d0ffffffff1b1ff43e000000001b1ff4c1ffffffff1b1ff42f000000001b1ff431ffffffff1b1ff42a000000001b1ff42cffffffff1b1ff3e9000000001b1ff3eaffffffff1b1ff3e5000000001b1ff3e6ffffffff1b1ff3d9000000001b1ff3daffffffff1b1ff3d5000000001b1ff3d6ffffffff1b1ff393000000001b1ff395ffffffff1b1ff38e000000001b1ff390ffffffff1b1ff07e000000001b1ff381ffffffff1b1ff06f000000001b1ff071ffffffff1b1ff06a000000001b1ff06cfff
fffff1b1ff029000000001b1ff02affffffff1b1ff025000000001b1ff026ffffffff1b1ff019000000001b1ff01affffffff1b1ff015000000001b1ff016ffffffff1b0aafd3000000001b0aafd5ffffffff1b0aafce000000001b0aafd0ffffffff1b0aaf3e000000001b0aafc1ffffffff1b0aaf2f000000001b0aaf31ffffffff1b0aaf2a000000001b0aaf2cffffffff1b0aaee9000000001b0aaeeaffffffff1b0aaee5000000001b0aaee6ffffffff1b0aaed9000000001b0aaedaffffffff1b0aaed5000000001b0aaed6ffffffff1b0aae93000000001b0aae95ffffffff1b0aae8e000000001b0aae90ffffffff1b0aa57e000000001b0aae81ffffffff1b0aa56f000000001b0aa571ffffffff1b0aa56a000000001b0aa56cffffffff1b0aa529000000001b0aa52affffffff1b0aa525000000001b0aa526ffffffff1b0aa519000000001b0aa51affffffff1b0aa515000000001b0aa516ffffffff1b0aa4d3000000001b0aa4d5ffffffff1b0aa4ce000000001b0aa4d0ffffffff1b0aa43e000000001b0aa4c1ffffffff1b0aa42f000000001b0aa431ffffffff1b0aa42a000000001b0aa42cffffffff1b0aa3e9000000001b0aa3eaffffffff1b0aa3e5000000001b0aa3e6ffffffff1b0aa3d9000000001b0aa3daffffffff1b0aa3d5000000001b0aa3d6ffffffff1b0aa393000000001b0aa395ffffffff1b0aa38e000000001b0aa390ffffffff1b0aa17e000000001b0aa381ffffffff1b0aa16f000000001b0aa171ffffffff1b0aa16a000000001b0aa16cffffffff1b0aa129000000001b0aa12affffffff1b0aa125000000001b0aa126ffffffff1b0aa119000000001b0aa11affffffff1b0aa115000000001b0aa116ffffffff1b0aa0d3000000001b0aa0d5ffffffff1b0aa0ce000000001b0aa0d0ffffffff1b0aa07e000000001b0aa0c1ffffffff1b0aa06f000000001b0aa071ffffffff1b0aa06a000000001b0aa06cffffffff1b0aa02a000000001b0aa02affffffff1cd57f54000000001cd57f57ffffffff1cd57e4c000000001cd57e57ffffffff1cd57e38000000001cd57e43ffffffff1cd57cf8000000001cd57e07ffffffff1cd57cbc000000001cd57cc7ffffffff1cd57ca8000000001cd57cb3ffffffff1cd57ba4000000001cd57babffffffff1cd57b94000000001cd57b9bffffffff1cd57b64000000001cd57b6bffffffff1cd57b54000000001cd57b5bffffffff1cd57a4c000000001cd57a57ffffffff1cd57a38000000001cd57a43ffffffff1cd571f8000000001cd57a07ffffffff1cd571bc000000001cd571c7ffffffff1cd571a8000000001cd571b3ffffffff1cd570a4000000001cd570abffffffff1cd57094000
000001cd5709bffffffff1cd57064000000001cd5706bffffffff1cd57054000000001cd5705bffffffff1cd56f4c000000001cd56f57ffffffff1cd56f38000000001cd56f43ffffffff1cd56cf8000000001cd56f07ffffffff1cd56cbc000000001cd56cc7ffffffff1cd56ca8000000001cd56cb3ffffffff1cd56ba4000000001cd56babffffffff1cd56b94000000001cd56b9bffffffff1cd56b64000000001cd56b6bffffffff1cd56b54000000001cd56b5bffffffff1cd56a4c000000001cd56a57ffffffff1cd56a38000000001cd56a43ffffffff1cd545f8000000001cd56a07ffffffff1cd545bc000000001cd545c7ffffffff1cd545a8000000001cd545b3ffffffff1cd544a4000000001cd544abffffffff1cd54494000000001cd5449bffffffff1cd54464000000001cd5446bffffffff1cd54454000000001cd5445bffffffff1cd5434c000000001cd54357ffffffff1cd54338000000001cd54343ffffffff1cd540f8000000001cd54307ffffffff1cd540bc000000001cd540c7ffffffff1cd540a8000000001cd540b3ffffffff1c803fa4000000001c803fabffffffff1c803f94000000001c803f9bffffffff1c803f64000000001c803f6bffffffff1c803f54000000001c803f5bffffffff1c803e4c000000001c803e57ffffffff1c803e38000000001c803e43ffffffff1c8031f8000000001c803e07ffffffff1c8031bc000000001c8031c7ffffffff1c8031a8000000001c8031b3ffffffff1c8030a4000000001c8030abffffffff1c803094000000001c80309bffffffff1c803064000000001c80306bffffffff1c803054000000001c80305bffffffff1c802f4c000000001c802f57ffffffff1c802f38000000001c802f43ffffffff1c802cf8000000001c802f07ffffffff1c802cbc000000001c802cc7ffffffff1c802ca8000000001c802cb3ffffffff1c802ba4000000001c802babffffffff1c802b94000000001c802b9bffffffff1c802b64000000001c802b6bffffffff1c802b54000000001c802b5bffffffff1c8029fc000000001c802a03ffffffff1c8029e8000000001c8029f3ffffffff1c80290c000000001c802917ffffffff1c8026fc000000001c802903ffffffff1c8026e8000000001c8026f3ffffffff1c80260c000000001c802617ffffffff1c8025fc000000001c802603ffffffff1c8025e8000000001c8025f3ffffffff1c80250c000000001c802517ffffffff1c801afc000000001c802503ffffffff1c801ae8000000001c801af3ffffffff1c801a0c000000001c801a17ffffffff1c8019fc000000001c801a03ffffffff1c8019e8000000001c8019f3ffffffff1c80190c000000001c801917fff
fffff1c8016fc000000001c801903ffffffff1c8016e8000000001c8016f3ffffffff1c80160c000000001c801617ffffffff1c8015fc000000001c801603ffffffff1c8015e8000000001c8015f3ffffffff1c80150c000000001c801517ffffffff1c7feafc000000001c801503ffffffff1c7feae8000000001c7feaf3ffffffff1c7fea0c000000001c7fea17ffffffff1c7fe9fc000000001c7fea03ffffffff1c7fe9e8000000001c7fe9f3ffffffff1c7fe90c000000001c7fe917ffffffff1c7fe6fc000000001c7fe903ffffffff1c7fe6e8000000001c7fe6f3ffffffff1c7fe60c000000001c7fe617ffffffff1c7fe5fc000000001c7fe603ffffffff1c7fe5e8000000001c7fe5f3ffffffff1c7fe50c000000001c7fe517ffffffff1c7fdafc000000001c7fe503ffffffff1c7fdae8000000001c7fdaf3ffffffff1c7fda0c000000001c7fda17ffffffff1c7fd9fc000000001c7fda03ffffffff1c7fd9e8000000001c7fd9f3ffffffff1c7fd90c000000001c7fd917ffffffff1c7fd6fc000000001c7fd903ffffffff1c7fd6e8000000001c7fd6f3ffffffff1c7fd60c000000001c7fd617ffffffff1c7fd5fc000000001c7fd603ffffffff1c7fd4a4000000001c7fd4abffffffff1c7fd494000000001c7fd49bffffffff1c7fd464000000001c7fd46bffffffff1c7fd454000000001c7fd45bffffffff1c7fd34c000000001c7fd357ffffffff1c7fd338000000001c7fd343ffffffff1c7fd0f8000000001c7fd307ffffffff1c7fd0bc000000001c7fd0c7ffffffff1c7fd0a8000000001c7fd0b3ffffffff1c7fcfa4000000001c7fcfabffffffff1c7fcf94000000001c7fcf9bffffffff1c7fcf64000000001c7fcf6bffffffff1c7fcf54000000001c7fcf5bffffffff1c7fce4c000000001c7fce57ffffffff1c7fce38000000001c7fce43ffffffff1c7fc1f8000000001c7fce07ffffffff1c7fc1bc000000001c7fc1c7ffffffff1c7fc1a8000000001c7fc1b3ffffffff1c7fc0a4000000001c7fc0abffffffff1c7fc094000000001c7fc09bffffffff1c7fc064000000001c7fc06bffffffff1c7fc054000000001c7fc05bffffffff1c2abf4c000000001c2abf57ffffffff1c2abf38000000001c2abf43ffffffff1c2abcf8000000001c2abf07ffffffff1c2abcbc000000001c2abcc7ffffffff1c2abca8000000001c2abcb3ffffffff1c2abba4000000001c2abbabffffffff1c2abb94000000001c2abb9bffffffff1c2abb64000000001c2abb6bffffffff1c2abb54000000001c2abb5bffffffff1c2aba4c000000001c2aba57ffffffff1c2aba38000000001c2aba43ffffffff1c2a95f8000000001c2aba07ffffffff1c2a95bc000
000001c2a95c7ffffffff1c2a95a8000000001c2a95b3ffffffff1c2a94a4000000001c2a94abffffffff1c2a9494000000001c2a949bffffffff1c2a9464000000001c2a946bffffffff1c2a9454000000001c2a945bffffffff1c2a934c000000001c2a9357ffffffff1c2a9338000000001c2a9343ffffffff1c2a90f8000000001c2a9307ffffffff1c2a90bc000000001c2a90c7ffffffff1c2a90a8000000001c2a90b3ffffffff1c2a8fa4000000001c2a8fabffffffff1c2a8f94000000001c2a8f9bffffffff1c2a8f64000000001c2a8f6bffffffff1c2a8f54000000001c2a8f5bffffffff1c2a8e4c000000001c2a8e57ffffffff1c2a8e38000000001c2a8e43ffffffff1c2a85f8000000001c2a8e07ffffffff1c2a85bc000000001c2a85c7ffffffff1c2a85a8000000001c2a85b3ffffffff1c2a84a4000000001c2a84abffffffff1c2a8494000000001c2a849bffffffff1c2a8464000000001c2a846bffffffff1c2a8454000000001c2a845bffffffff1c2a834c000000001c2a8357ffffffff1c2a8338000000001c2a8343ffffffff1c2a81f8000000001c2a8307ffffffff1c2a81bc000000001c2a81c7ffffffff1c2a81a8000000001c2a81b3ffffffff1c2a80a8000000001c2a80abffffffff1d0355fd50000000001d0355fd5fffffffff1d0355f930000000001d0355f95fffffffff1d0355f8e0000000001d0355f90fffffffff1d0355f3e0000000001d0355f81fffffffff1d0355f2f0000000001d0355f31fffffffff1d0355f2a0000000001d0355f2cfffffffff1d0355ee90000000001d0355eeafffffffff1d0355ee50000000001d0355ee6fffffffff1d0355ed90000000001d0355edafffffffff1d0355ed50000000001d0355ed6fffffffff1d0355e930000000001d0355e95fffffffff1d0355e8e0000000001d0355e90fffffffff1d0355c7e0000000001d0355e81fffffffff1d0355c6f0000000001d0355c71fffffffff1d0355c6a0000000001d0355c6cfffffffff1d0355c290000000001d0355c2afffffffff1d0355c250000000001d0355c26fffffffff1d0355c190000000001d0355c1afffffffff1d0355c150000000001d0355c16fffffffff1d0355bd30000000001d0355bd5fffffffff1d0355bce0000000001d0355bd0fffffffff1d0355b3e0000000001d0355bc1fffffffff1d0355b2f0000000001d0355b31fffffffff1d0355b2a0000000001d0355b2cfffffffff1d0355ae90000000001d0355aeafffffffff1d0355ae50000000001d0355ae6fffffffff1d0355ad90000000001d0355adafffffffff1d0355ad50000000001d0355ad6fffffffff1d0355a930000000001d0355a95fffffffff1d0355a
8e0000000001d0355a90fffffffff1d035517e0000000001d0355a81fffffffff1d035516f0000000001d0355171fffffffff1d035516a0000000001d035516cfffffffff1d03551290000000001d035512afffffffff1d03551250000000001d0355126fffffffff1d03551190000000001d035511afffffffff1d03551150000000001d0355116fffffffff1d03550d30000000001d03550d5fffffffff1d03550ce0000000001d03550d0fffffffff1d035503e0000000001d03550c1fffffffff1d035502f0000000001d0355031fffffffff1d035502a0000000001d035502cfffffffff1d0200fe90000000001d0200feafffffffff1d0200fe50000000001d0200fe6fffffffff1d0200fd90000000001d0200fdafffffffff1d0200fd50000000001d0200fd6fffffffff1d0200f930000000001d0200f95fffffffff1d0200f8e0000000001d0200f90fffffffff1d0200c7e0000000001d0200f81fffffffff1d0200c6f0000000001d0200c71fffffffff1d0200c6a0000000001d0200c6cfffffffff1d0200c290000000001d0200c2afffffffff1d0200c250000000001d0200c26fffffffff1d0200c190000000001d0200c1afffffffff1d0200c150000000001d0200c16fffffffff1d0200bd30000000001d0200bd5fffffffff1d0200bce0000000001d0200bd0fffffffff1d0200b3e0000000001d0200bc1fffffffff1d0200b2f0000000001d0200b31fffffffff1d0200b2a0000000001d0200b2cfffffffff1d0200ae90000000001d0200aeafffffffff1d0200ae50000000001d0200ae6fffffffff1d0200ad90000000001d0200adafffffffff1d0200ad50000000001d0200ad6fffffffff1d0200a7f0000000001d0200a80fffffffff1d0200a7a0000000001d0200a7cfffffffff1d0200a430000000001d0200a45fffffffff1d02009bf0000000001d0200a40fffffffff1d02009ba0000000001d02009bcfffffffff1d02009830000000001d0200985fffffffff1d020097f0000000001d0200980fffffffff1d020097a0000000001d020097cfffffffff1d02009430000000001d0200945fffffffff1d02006bf0000000001d0200940fffffffff1d02006ba0000000001d02006bcfffffffff1d02006830000000001d0200685fffffffff1d020067f0000000001d0200680fffffffff1d020067a0000000001d020067cfffffffff1d02006430000000001d0200645fffffffff1d02005bf0000000001d0200640fffffffff1d02005ba0000000001d02005bcfffffffff1d02005830000000001d0200585fffffffff1d020057f0000000001d0200580fffffffff1d020057a0000000001d020057cfffffffff1d02005430000000001d0200545
fffffffff1d01ffabf0000000001d0200540fffffffff1d01ffaba0000000001d01ffabcfffffffff1d01ffa830000000001d01ffa85fffffffff1d01ffa7f0000000001d01ffa80fffffffff1d01ffa7a0000000001d01ffa7cfffffffff1d01ffa430000000001d01ffa45fffffffff1d01ff9bf0000000001d01ffa40fffffffff1d01ff9ba0000000001d01ff9bcfffffffff1d01ff9830000000001d01ff985fffffffff1d01ff97f0000000001d01ff980fffffffff1d01ff97a0000000001d01ff97cfffffffff1d01ff9430000000001d01ff945fffffffff1d01ff6bf0000000001d01ff940fffffffff1d01ff6ba0000000001d01ff6bcfffffffff1d01ff6830000000001d01ff685fffffffff1d01ff67f0000000001d01ff680fffffffff1d01ff67a0000000001d01ff67cfffffffff1d01ff6430000000001d01ff645fffffffff1d01ff5bf0000000001d01ff640fffffffff1d01ff5ba0000000001d01ff5bcfffffffff1d01ff5830000000001d01ff585fffffffff1d01ff57f0000000001d01ff580fffffffff1d01ff5290000000001d01ff52afffffffff1d01ff5250000000001d01ff526fffffffff1d01ff5190000000001d01ff51afffffffff1d01ff5150000000001d01ff516fffffffff1d01ff4d30000000001d01ff4d5fffffffff1d01ff4ce0000000001d01ff4d0fffffffff1d01ff43e0000000001d01ff4c1fffffffff1d01ff42f0000000001d01ff431fffffffff1d01ff42a0000000001d01ff42cfffffffff1d01ff3e90000000001d01ff3eafffffffff1d01ff3e50000000001d01ff3e6fffffffff1d01ff3d90000000001d01ff3dafffffffff1d01ff3d50000000001d01ff3d6fffffffff1d01ff3930000000001d01ff395fffffffff1d01ff38e0000000001d01ff390fffffffff1d01ff07e0000000001d01ff381fffffffff1d01ff06f0000000001d01ff071fffffffff1d01ff06a0000000001d01ff06cfffffffff1d01ff0290000000001d01ff02afffffffff1d01ff0250000000001d01ff026fffffffff1d01ff0190000000001d01ff01afffffffff1d01ff0150000000001d01ff016fffffffff1d00aafd30000000001d00aafd5fffffffff1d00aafce0000000001d00aafd0fffffffff1d00aaf3e0000000001d00aafc1fffffffff1d00aaf2f0000000001d00aaf31fffffffff1d00aaf2a0000000001d00aaf2cfffffffff1d00aaee90000000001d00aaeeafffffffff1d00aaee50000000001d00aaee6fffffffff1d00aaed90000000001d00aaedafffffffff1d00aaed50000000001d00aaed6fffffffff1d00aae930000000001d00aae95fffffffff1d00aae8e0000000001d00aae90fffffffff1d00aa57e00
00000001d00aae81fffffffff1d00aa56f0000000001d00aa571fffffffff1d00aa56a0000000001d00aa56cfffffffff1d00aa5290000000001d00aa52afffffffff1d00aa5250000000001d00aa526fffffffff1d00aa5190000000001d00aa51afffffffff1d00aa5150000000001d00aa516fffffffff1d00aa4d30000000001d00aa4d5fffffffff1d00aa4ce0000000001d00aa4d0fffffffff1d00aa43e0000000001d00aa4c1fffffffff1d00aa42f0000000001d00aa431fffffffff1d00aa42a0000000001d00aa42cfffffffff1d00aa3e90000000001d00aa3eafffffffff1d00aa3e50000000001d00aa3e6fffffffff1d00aa3d90000000001d00aa3dafffffffff1d00aa3d50000000001d00aa3d6fffffffff1d00aa3930000000001d00aa395fffffffff1d00aa38e0000000001d00aa390fffffffff1d00aa17e0000000001d00aa381fffffffff1d00aa16f0000000001d00aa171fffffffff1d00aa16a0000000001d00aa16cfffffffff1d00aa1290000000001d00aa12afffffffff1d00aa1250000000001d00aa126fffffffff1d00aa1190000000001d00aa11afffffffff1d00aa1150000000001d00aa116fffffffff1d00aa0d30000000001d00aa0d5fffffffff1d00aa0ce0000000001d00aa0d0fffffffff1d00aa07e0000000001d00aa0c1fffffffff1d00aa06f0000000001d00aa071fffffffff1d00aa06a0000000001d00aa06cfffffffff1d00aa02a0000000001d00aa02afffffffff1e0d57f540000000001e0d57f57fffffffff1e0d57e4c0000000001e0d57e57fffffffff1e0d57e380000000001e0d57e43fffffffff1e0d57cf80000000001e0d57e07fffffffff1e0d57cbc0000000001e0d57cc7fffffffff1e0d57ca80000000001e0d57cb3fffffffff1e0d57ba40000000001e0d57babfffffffff1e0d57b940000000001e0d57b9bfffffffff1e0d57b640000000001e0d57b6bfffffffff1e0d57b540000000001e0d57b5bfffffffff1e0d57a4c0000000001e0d57a57fffffffff1e0d57a380000000001e0d57a43fffffffff1e0d571f80000000001e0d57a07fffffffff1e0d571bc0000000001e0d571c7fffffffff1e0d571a80000000001e0d571b3fffffffff1e0d570a40000000001e0d570abfffffffff1e0d570940000000001e0d5709bfffffffff1e0d570640000000001e0d5706bfffffffff1e0d570540000000001e0d5705bfffffffff1e0d56f4c0000000001e0d56f57fffffffff1e0d56f380000000001e0d56f43fffffffff1e0d56cf80000000001e0d56f07fffffffff1e0d56cbc0000000001e0d56cc7fffffffff1e0d56ca80000000001e0d56cb3fffffffff1e0d56ba40000000001e0d56babffff
fffff1e0d56b940000000001e0d56b9bfffffffff1e0d56b640000000001e0d56b6bfffffffff1e0d56b540000000001e0d56b5bfffffffff1e0d56a4c0000000001e0d56a57fffffffff1e0d56a380000000001e0d56a43fffffffff1e0d545f80000000001e0d56a07fffffffff1e0d545bc0000000001e0d545c7fffffffff1e0d545a80000000001e0d545b3fffffffff1e0d544a40000000001e0d544abfffffffff1e0d544940000000001e0d5449bfffffffff1e0d544640000000001e0d5446bfffffffff1e0d544540000000001e0d5445bfffffffff1e0d5434c0000000001e0d54357fffffffff1e0d543380000000001e0d54343fffffffff1e0d540f80000000001e0d54307fffffffff1e0d540bc0000000001e0d540c7fffffffff1e0d540a80000000001e0d540b3fffffffff1e0803fa40000000001e0803fabfffffffff1e0803f940000000001e0803f9bfffffffff1e0803f640000000001e0803f6bfffffffff1e0803f540000000001e0803f5bfffffffff1e0803e4c0000000001e0803e57fffffffff1e0803e380000000001e0803e43fffffffff1e08031f80000000001e0803e07fffffffff1e08031bc0000000001e08031c7fffffffff1e08031a80000000001e08031b3fffffffff1e08030a40000000001e08030abfffffffff1e08030940000000001e080309bfffffffff1e08030640000000001e080306bfffffffff1e08030540000000001e080305bfffffffff1e0802f4c0000000001e0802f57fffffffff1e0802f380000000001e0802f43fffffffff1e0802cf80000000001e0802f07fffffffff1e0802cbc0000000001e0802cc7fffffffff1e0802ca80000000001e0802cb3fffffffff1e0802ba40000000001e0802babfffffffff1e0802b940000000001e0802b9bfffffffff1e0802b640000000001e0802b6bfffffffff1e0802b540000000001e0802b5bfffffffff1e08029fc0000000001e0802a03fffffffff1e08029e80000000001e08029f3fffffffff1e080290c0000000001e0802917fffffffff1e08026fc0000000001e0802903fffffffff1e08026e80000000001e08026f3fffffffff1e080260c0000000001e0802617fffffffff1e08025fc0000000001e0802603fffffffff1e08025e80000000001e08025f3fffffffff1e080250c0000000001e0802517fffffffff1e0801afc0000000001e0802503fffffffff1e0801ae80000000001e0801af3fffffffff1e0801a0c0000000001e0801a17fffffffff1e08019fc0000000001e0801a03fffffffff1e08019e80000000001e08019f3fffffffff1e080190c0000000001e0801917fffffffff1e08016fc0000000001e0801903fffffffff1e08016e8000000
0001e08016f3fffffffff1e080160c0000000001e0801617fffffffff1e08015fc0000000001e0801603fffffffff1e08015e80000000001e08015f3fffffffff1e080150c0000000001e0801517fffffffff1e07feafc0000000001e0801503fffffffff1e07feae80000000001e07feaf3fffffffff1e07fea0c0000000001e07fea17fffffffff1e07fe9fc0000000001e07fea03fffffffff1e07fe9e80000000001e07fe9f3fffffffff1e07fe90c0000000001e07fe917fffffffff1e07fe6fc0000000001e07fe903fffffffff1e07fe6e80000000001e07fe6f3fffffffff1e07fe60c0000000001e07fe617fffffffff1e07fe5fc0000000001e07fe603fffffffff1e07fe5e80000000001e07fe5f3fffffffff1e07fe50c0000000001e07fe517fffffffff1e07fdafc0000000001e07fe503fffffffff1e07fdae80000000001e07fdaf3fffffffff1e07fda0c0000000001e07fda17fffffffff1e07fd9fc0000000001e07fda03fffffffff1e07fd9e80000000001e07fd9f3fffffffff1e07fd90c0000000001e07fd917fffffffff1e07fd6fc0000000001e07fd903fffffffff1e07fd6e80000000001e07fd6f3fffffffff1e07fd60c0000000001e07fd617fffffffff1e07fd5fc0000000001e07fd603fffffffff1e07fd4a40000000001e07fd4abfffffffff1e07fd4940000000001e07fd49bfffffffff1e07fd4640000000001e07fd46bfffffffff1e07fd4540000000001e07fd45bfffffffff1e07fd34c0000000001e07fd357fffffffff1e07fd3380000000001e07fd343fffffffff1e07fd0f80000000001e07fd307fffffffff1e07fd0bc0000000001e07fd0c7fffffffff1e07fd0a80000000001e07fd0b3fffffffff1e07fcfa40000000001e07fcfabfffffffff1e07fcf940000000001e07fcf9bfffffffff1e07fcf640000000001e07fcf6bfffffffff1e07fcf540000000001e07fcf5bfffffffff1e07fce4c0000000001e07fce57fffffffff1e07fce380000000001e07fce43fffffffff1e07fc1f80000000001e07fce07fffffffff1e07fc1bc0000000001e07fc1c7fffffffff1e07fc1a80000000001e07fc1b3fffffffff1e07fc0a40000000001e07fc0abfffffffff1e07fc0940000000001e07fc09bfffffffff1e07fc0640000000001e07fc06bfffffffff1e07fc0540000000001e07fc05bfffffffff1e02abf4c0000000001e02abf57fffffffff1e02abf380000000001e02abf43fffffffff1e02abcf80000000001e02abf07fffffffff1e02abcbc0000000001e02abcc7fffffffff1e02abca80000000001e02abcb3fffffffff1e02abba40000000001e02abbabfffffffff1e02abb940000000001e02abb9bffffffff
f1e02abb640000000001e02abb6bfffffffff1e02abb540000000001e02abb5bfffffffff1e02aba4c0000000001e02aba57fffffffff1e02aba380000000001e02aba43fffffffff1e02a95f80000000001e02aba07fffffffff1e02a95bc0000000001e02a95c7fffffffff1e02a95a80000000001e02a95b3fffffffff1e02a94a40000000001e02a94abfffffffff1e02a94940000000001e02a949bfffffffff1e02a94640000000001e02a946bfffffffff1e02a94540000000001e02a945bfffffffff1e02a934c0000000001e02a9357fffffffff1e02a93380000000001e02a9343fffffffff1e02a90f80000000001e02a9307fffffffff1e02a90bc0000000001e02a90c7fffffffff1e02a90a80000000001e02a90b3fffffffff1e02a8fa40000000001e02a8fabfffffffff1e02a8f940000000001e02a8f9bfffffffff1e02a8f640000000001e02a8f6bfffffffff1e02a8f540000000001e02a8f5bfffffffff1e02a8e4c0000000001e02a8e57fffffffff1e02a8e380000000001e02a8e43fffffffff1e02a85f80000000001e02a8e07fffffffff1e02a85bc0000000001e02a85c7fffffffff1e02a85a80000000001e02a85b3fffffffff1e02a84a40000000001e02a84abfffffffff1e02a84940000000001e02a849bfffffffff1e02a84640000000001e02a846bfffffffff1e02a84540000000001e02a845bfffffffff1e02a834c0000000001e02a8357fffffffff1e02a83380000000001e02a8343fffffffff1e02a81f80000000001e02a8307fffffffff1e02a81bc0000000001e02a81c7fffffffff1e02a81a80000000001e02a81b3fffffffff1e02a80a80000000001e02a80abfffffffff1f355fd500000000001f355fd5ffffffffff1f355f9300000000001f355f95ffffffffff1f355f8e00000000001f355f90ffffffffff1f355f3e00000000001f355f81ffffffffff1f355f2f00000000001f355f31ffffffffff1f355f2a00000000001f355f2cffffffffff1f355ee900000000001f355eeaffffffffff1f355ee500000000001f355ee6ffffffffff1f355ed900000000001f355edaffffffffff1f355ed500000000001f355ed6ffffffffff1f355e9300000000001f355e95ffffffffff1f355e8e00000000001f355e90ffffffffff1f355c7e00000000001f355e81ffffffffff1f355c6f00000000001f355c71ffffffffff1f355c6a00000000001f355c6cffffffffff1f355c2900000000001f355c2affffffffff1f355c2500000000001f355c26ffffffffff1f355c1900000000001f355c1affffffffff1f355c1500000000001f355c16ffffffffff1f355bd300000000001f355bd5ffffffffff1f355bce00000000001
f355bd0ffffffffff1f355b3e00000000001f355bc1ffffffffff1f355b2f00000000001f355b31ffffffffff1f355b2a00000000001f355b2cffffffffff1f355ae900000000001f355aeaffffffffff1f355ae500000000001f355ae6ffffffffff1f355ad900000000001f355adaffffffffff1f355ad500000000001f355ad6ffffffffff1f355a9300000000001f355a95ffffffffff1f355a8e00000000001f355a90ffffffffff1f35517e00000000001f355a81ffffffffff1f35516f00000000001f355171ffffffffff1f35516a00000000001f35516cffffffffff1f35512900000000001f35512affffffffff1f35512500000000001f355126ffffffffff1f35511900000000001f35511affffffffff1f35511500000000001f355116ffffffffff1f3550d300000000001f3550d5ffffffffff1f3550ce00000000001f3550d0ffffffffff1f35503e00000000001f3550c1ffffffffff1f35502f00000000001f355031ffffffffff1f35502a00000000001f35502cffffffffff1f200fe900000000001f200feaffffffffff1f200fe500000000001f200fe6ffffffffff1f200fd900000000001f200fdaffffffffff1f200fd500000000001f200fd6ffffffffff1f200f9300000000001f200f95ffffffffff1f200f8e00000000001f200f90ffffffffff1f200c7e00000000001f200f81ffffffffff1f200c6f00000000001f200c71ffffffffff1f200c6a00000000001f200c6cffffffffff1f200c2900000000001f200c2affffffffff1f200c2500000000001f200c26ffffffffff1f200c1900000000001f200c1affffffffff1f200c1500000000001f200c16ffffffffff1f200bd300000000001f200bd5ffffffffff1f200bce00000000001f200bd0ffffffffff1f200b3e00000000001f200bc1ffffffffff1f200b2f00000000001f200b31ffffffffff1f200b2a00000000001f200b2cffffffffff1f200ae900000000001f200aeaffffffffff1f200ae500000000001f200ae6ffffffffff1f200ad900000000001f200adaffffffffff1f200ad500000000001f200ad6ffffffffff1f200a7f00000000001f200a80ffffffffff1f200a7a00000000001f200a7cffffffffff1f200a4300000000001f200a45ffffffffff1f2009bf00000000001f200a40ffffffffff1f2009ba00000000001f2009bcffffffffff1f20098300000000001f200985ffffffffff1f20097f00000000001f200980ffffffffff1f20097a00000000001f20097cffffffffff1f20094300000000001f200945ffffffffff1f2006bf00000000001f200940ffffffffff1f2006ba00000000001f2006bcffffffffff1f20068300000000001f200685ffffffffff1f2
0067f00000000001f200680ffffffffff1f20067a00000000001f20067cffffffffff1f20064300000000001f200645ffffffffff1f2005bf00000000001f200640ffffffffff1f2005ba00000000001f2005bcffffffffff1f20058300000000001f200585ffffffffff1f20057f00000000001f200580ffffffffff1f20057a00000000001f20057cffffffffff1f20054300000000001f200545ffffffffff1f1ffabf00000000001f200540ffffffffff1f1ffaba00000000001f1ffabcffffffffff1f1ffa8300000000001f1ffa85ffffffffff1f1ffa7f00000000001f1ffa80ffffffffff1f1ffa7a00000000001f1ffa7cffffffffff1f1ffa4300000000001f1ffa45ffffffffff1f1ff9bf00000000001f1ffa40ffffffffff1f1ff9ba00000000001f1ff9bcffffffffff1f1ff98300000000001f1ff985ffffffffff1f1ff97f00000000001f1ff980ffffffffff1f1ff97a00000000001f1ff97cffffffffff1f1ff94300000000001f1ff945ffffffffff1f1ff6bf00000000001f1ff940ffffffffff1f1ff6ba00000000001f1ff6bcffffffffff1f1ff68300000000001f1ff685ffffffffff1f1ff67f00000000001f1ff680ffffffffff1f1ff67a00000000001f1ff67cffffffffff1f1ff64300000000001f1ff645ffffffffff1f1ff5bf00000000001f1ff640ffffffffff1f1ff5ba00000000001f1ff5bcffffffffff1f1ff58300000000001f1ff585ffffffffff1f1ff57f00000000001f1ff580ffffffffff1f1ff52900000000001f1ff52affffffffff1f1ff52500000000001f1ff526ffffffffff1f1ff51900000000001f1ff51affffffffff1f1ff51500000000001f1ff516ffffffffff1f1ff4d300000000001f1ff4d5ffffffffff1f1ff4ce00000000001f1ff4d0ffffffffff1f1ff43e00000000001f1ff4c1ffffffffff1f1ff42f00000000001f1ff431ffffffffff1f1ff42a00000000001f1ff42cffffffffff1f1ff3e900000000001f1ff3eaffffffffff1f1ff3e500000000001f1ff3e6ffffffffff1f1ff3d900000000001f1ff3daffffffffff1f1ff3d500000000001f1ff3d6ffffffffff1f1ff39300000000001f1ff395ffffffffff1f1ff38e00000000001f1ff390ffffffffff1f1ff07e00000000001f1ff381ffffffffff1f1ff06f00000000001f1ff071ffffffffff1f1ff06a00000000001f1ff06cffffffffff1f1ff02900000000001f1ff02affffffffff1f1ff02500000000001f1ff026ffffffffff1f1ff01900000000001f1ff01affffffffff1f1ff01500000000001f1ff016ffffffffff1f0aafd300000000001f0aafd5ffffffffff1f0aafce00000000001f0aafd0ffffffffff1f0aaf3e00000000001f0aa
fc1ffffffffff1f0aaf2f00000000001f0aaf31ffffffffff1f0aaf2a00000000001f0aaf2cffffffffff1f0aaee900000000001f0aaeeaffffffffff1f0aaee500000000001f0aaee6ffffffffff1f0aaed900000000001f0aaedaffffffffff1f0aaed500000000001f0aaed6ffffffffff1f0aae9300000000001f0aae95ffffffffff1f0aae8e00000000001f0aae90ffffffffff1f0aa57e00000000001f0aae81ffffffffff1f0aa56f00000000001f0aa571ffffffffff1f0aa56a00000000001f0aa56cffffffffff1f0aa52900000000001f0aa52affffffffff1f0aa52500000000001f0aa526ffffffffff1f0aa51900000000001f0aa51affffffffff1f0aa51500000000001f0aa516ffffffffff1f0aa4d300000000001f0aa4d5ffffffffff1f0aa4ce00000000001f0aa4d0ffffffffff1f0aa43e00000000001f0aa4c1ffffffffff1f0aa42f00000000001f0aa431ffffffffff1f0aa42a00000000001f0aa42cffffffffff1f0aa3e900000000001f0aa3eaffffffffff1f0aa3e500000000001f0aa3e6ffffffffff1f0aa3d900000000001f0aa3daffffffffff1f0aa3d500000000001f0aa3d6ffffffffff1f0aa39300000000001f0aa395ffffffffff1f0aa38e00000000001f0aa390ffffffffff1f0aa17e00000000001f0aa381ffffffffff1f0aa16f00000000001f0aa171ffffffffff1f0aa16a00000000001f0aa16cffffffffff1f0aa12900000000001f0aa12affffffffff1f0aa12500000000001f0aa126ffffffffff1f0aa11900000000001f0aa11affffffffff1f0aa11500000000001f0aa116ffffffffff1f0aa0d300000000001f0aa0d5ffffffffff1f0aa0ce00000000001f0aa0d0ffffffffff1f0aa07e00000000001f0aa0c1ffffffffff1f0aa06f00000000001f0aa071ffffffffff1f0aa06a00000000001f0aa06cffffffffff1f0aa02a00000000001f0aa02affffffffff \ No newline at end of file diff --git a/core/utils/type-utils/src/test/resources/datawave/data/normalizer/pointRanges.txt b/core/utils/type-utils/src/test/resources/datawave/data/normalizer/pointRanges.txt new file mode 100644 index 00000000000..ca8c5933621 --- /dev/null +++ b/core/utils/type-utils/src/test/resources/datawave/data/normalizer/pointRanges.txt @@ -0,0 +1 @@ 
+1f355fd500000000001f355fd5ffffffffff1f355f9300000000001f355f95ffffffffff1f355f8e00000000001f355f90ffffffffff1f355f3e00000000001f355f81ffffffffff1f355f2f00000000001f355f31ffffffffff1f355f2a00000000001f355f2cffffffffff1f355ee900000000001f355eeaffffffffff1f355ee500000000001f355ee6ffffffffff1f355ed900000000001f355edaffffffffff1f355ed500000000001f355ed6ffffffffff1f355e9300000000001f355e95ffffffffff1f355e8e00000000001f355e90ffffffffff1f355c7e00000000001f355e81ffffffffff1f355c6f00000000001f355c71ffffffffff1f355c6a00000000001f355c6cffffffffff1f355c2900000000001f355c2affffffffff1f355c2500000000001f355c26ffffffffff1f355c1900000000001f355c1affffffffff1f355c1500000000001f355c16ffffffffff1f355bd300000000001f355bd5ffffffffff1f355bce00000000001f355bd0ffffffffff1f355b3e00000000001f355bc1ffffffffff1f355b2f00000000001f355b31ffffffffff1f355b2a00000000001f355b2cffffffffff1f355ae900000000001f355aeaffffffffff1f355ae500000000001f355ae6ffffffffff1f355ad900000000001f355adaffffffffff1f355ad500000000001f355ad6ffffffffff1f355a9300000000001f355a95ffffffffff1f355a8e00000000001f355a90ffffffffff1f35517e00000000001f355a81ffffffffff1f35516f00000000001f355171ffffffffff1f35516a00000000001f35516cffffffffff1f35512900000000001f35512affffffffff1f35512500000000001f355126ffffffffff1f35511900000000001f35511affffffffff1f35511500000000001f355116ffffffffff1f3550d300000000001f3550d5ffffffffff1f3550ce00000000001f3550d0ffffffffff1f35503e00000000001f3550c1ffffffffff1f35502f00000000001f355031ffffffffff1f35502a00000000001f35502cffffffffff1f200fe900000000001f200feaffffffffff1f200fe500000000001f200fe6ffffffffff1f200fd900000000001f200fdaffffffffff1f200fd500000000001f200fd6ffffffffff1f200f9300000000001f200f95ffffffffff1f200f8e00000000001f200f90ffffffffff1f200c7e00000000001f200f81ffffffffff1f200c6f00000000001f200c71ffffffffff1f200c6a00000000001f200c6cffffffffff1f200c2900000000001f200c2affffffffff1f200c2500000000001f200c26ffffffffff1f200c1900000000001f200c1affffffffff1f200c1500000000001f200c16ffffffffff1f200bd300000000001
f200bd5ffffffffff1f200bce00000000001f200bd0ffffffffff1f200b3e00000000001f200bc1ffffffffff1f200b2f00000000001f200b31ffffffffff1f200b2a00000000001f200b2cffffffffff1f200ae900000000001f200aeaffffffffff1f200ae500000000001f200ae6ffffffffff1f200ad900000000001f200adaffffffffff1f200ad500000000001f200ad6ffffffffff1f200a7f00000000001f200a80ffffffffff1f200a7a00000000001f200a7cffffffffff1f200a4300000000001f200a45ffffffffff1f2009bf00000000001f200a40ffffffffff1f2009ba00000000001f2009bcffffffffff1f20098300000000001f200985ffffffffff1f20097f00000000001f200980ffffffffff1f20097a00000000001f20097cffffffffff1f20094300000000001f200945ffffffffff1f2006bf00000000001f200940ffffffffff1f2006ba00000000001f2006bcffffffffff1f20068300000000001f200685ffffffffff1f20067f00000000001f200680ffffffffff1f20067a00000000001f20067cffffffffff1f20064300000000001f200645ffffffffff1f2005bf00000000001f200640ffffffffff1f2005ba00000000001f2005bcffffffffff1f20058300000000001f200585ffffffffff1f20057f00000000001f200580ffffffffff1f20057a00000000001f20057cffffffffff1f20054300000000001f200545ffffffffff1f1ffabf00000000001f200540ffffffffff1f1ffaba00000000001f1ffabcffffffffff1f1ffa8300000000001f1ffa85ffffffffff1f1ffa7f00000000001f1ffa80ffffffffff1f1ffa7a00000000001f1ffa7cffffffffff1f1ffa4300000000001f1ffa45ffffffffff1f1ff9bf00000000001f1ffa40ffffffffff1f1ff9ba00000000001f1ff9bcffffffffff1f1ff98300000000001f1ff985ffffffffff1f1ff97f00000000001f1ff980ffffffffff1f1ff97a00000000001f1ff97cffffffffff1f1ff94300000000001f1ff945ffffffffff1f1ff6bf00000000001f1ff940ffffffffff1f1ff6ba00000000001f1ff6bcffffffffff1f1ff68300000000001f1ff685ffffffffff1f1ff67f00000000001f1ff680ffffffffff1f1ff67a00000000001f1ff67cffffffffff1f1ff64300000000001f1ff645ffffffffff1f1ff5bf00000000001f1ff640ffffffffff1f1ff5ba00000000001f1ff5bcffffffffff1f1ff58300000000001f1ff585ffffffffff1f1ff57f00000000001f1ff580ffffffffff1f1ff52900000000001f1ff52affffffffff1f1ff52500000000001f1ff526ffffffffff1f1ff51900000000001f1ff51affffffffff1f1ff51500000000001f1ff516ffffffffff1f1
ff4d300000000001f1ff4d5ffffffffff1f1ff4ce00000000001f1ff4d0ffffffffff1f1ff43e00000000001f1ff4c1ffffffffff1f1ff42f00000000001f1ff431ffffffffff1f1ff42a00000000001f1ff42cffffffffff1f1ff3e900000000001f1ff3eaffffffffff1f1ff3e500000000001f1ff3e6ffffffffff1f1ff3d900000000001f1ff3daffffffffff1f1ff3d500000000001f1ff3d6ffffffffff1f1ff39300000000001f1ff395ffffffffff1f1ff38e00000000001f1ff390ffffffffff1f1ff07e00000000001f1ff381ffffffffff1f1ff06f00000000001f1ff071ffffffffff1f1ff06a00000000001f1ff06cffffffffff1f1ff02900000000001f1ff02affffffffff1f1ff02500000000001f1ff026ffffffffff1f1ff01900000000001f1ff01affffffffff1f1ff01500000000001f1ff016ffffffffff1f0aafd300000000001f0aafd5ffffffffff1f0aafce00000000001f0aafd0ffffffffff1f0aaf3e00000000001f0aafc1ffffffffff1f0aaf2f00000000001f0aaf31ffffffffff1f0aaf2a00000000001f0aaf2cffffffffff1f0aaee900000000001f0aaeeaffffffffff1f0aaee500000000001f0aaee6ffffffffff1f0aaed900000000001f0aaedaffffffffff1f0aaed500000000001f0aaed6ffffffffff1f0aae9300000000001f0aae95ffffffffff1f0aae8e00000000001f0aae90ffffffffff1f0aa57e00000000001f0aae81ffffffffff1f0aa56f00000000001f0aa571ffffffffff1f0aa56a00000000001f0aa56cffffffffff1f0aa52900000000001f0aa52affffffffff1f0aa52500000000001f0aa526ffffffffff1f0aa51900000000001f0aa51affffffffff1f0aa51500000000001f0aa516ffffffffff1f0aa4d300000000001f0aa4d5ffffffffff1f0aa4ce00000000001f0aa4d0ffffffffff1f0aa43e00000000001f0aa4c1ffffffffff1f0aa42f00000000001f0aa431ffffffffff1f0aa42a00000000001f0aa42cffffffffff1f0aa3e900000000001f0aa3eaffffffffff1f0aa3e500000000001f0aa3e6ffffffffff1f0aa3d900000000001f0aa3daffffffffff1f0aa3d500000000001f0aa3d6ffffffffff1f0aa39300000000001f0aa395ffffffffff1f0aa38e00000000001f0aa390ffffffffff1f0aa17e00000000001f0aa381ffffffffff1f0aa16f00000000001f0aa171ffffffffff1f0aa16a00000000001f0aa16cffffffffff1f0aa12900000000001f0aa12affffffffff1f0aa12500000000001f0aa126ffffffffff1f0aa11900000000001f0aa11affffffffff1f0aa11500000000001f0aa116ffffffffff1f0aa0d300000000001f0aa0d5ffffffffff1f0aa0ce00000000001f0aa
0d0ffffffffff1f0aa07e00000000001f0aa0c1ffffffffff1f0aa06f00000000001f0aa071ffffffffff1f0aa06a00000000001f0aa06cffffffffff1f0aa02a00000000001f0aa02affffffffff \ No newline at end of file diff --git a/core/utils/type-utils/src/test/resources/datawave/webservice/query/util/TypedValueExpectedEncoded.xml b/core/utils/type-utils/src/test/resources/datawave/webservice/query/util/TypedValueExpectedEncoded.xml new file mode 100644 index 00000000000..8ce52a33f50 --- /dev/null +++ b/core/utils/type-utils/src/test/resources/datawave/webservice/query/util/TypedValueExpectedEncoded.xml @@ -0,0 +1 @@ +{1} diff --git a/core/utils/type-utils/src/test/resources/datawave/webservice/query/util/TypedValueExpectedUnencoded.xml b/core/utils/type-utils/src/test/resources/datawave/webservice/query/util/TypedValueExpectedUnencoded.xml new file mode 100644 index 00000000000..d607f4d8ee7 --- /dev/null +++ b/core/utils/type-utils/src/test/resources/datawave/webservice/query/util/TypedValueExpectedUnencoded.xml @@ -0,0 +1 @@ +{1} \ No newline at end of file diff --git a/core/utils/type-utils/src/test/resources/log4j.properties b/core/utils/type-utils/src/test/resources/log4j.properties new file mode 100644 index 00000000000..cacd01b436c --- /dev/null +++ b/core/utils/type-utils/src/test/resources/log4j.properties @@ -0,0 +1,6 @@ +log4j.rootCategory=INFO, CONSOLE + +log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender +log4j.appender.CONSOLE.Threshold=INFO +log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout +log4j.appender.CONSOLE.layout.ConversionPattern=%-5p [%C{1}:%M] %m%n From c5ed79768e052128aaff37e126698a5113cb83e8 Mon Sep 17 00:00:00 2001 From: Ivan Bella <347158+ivakegg@users.noreply.github.com> Date: Mon, 25 Nov 2024 15:32:01 +0000 Subject: [PATCH 02/42] git subrepo clone git@github.com:NationalSecurityAgency/datawave-utils.git contrib/datawave-utils subrepo: subdir: "contrib/datawave-utils" merged: "4348fc36a3" upstream: origin: 
"git@github.com:NationalSecurityAgency/datawave-utils.git" branch: "main" commit: "4348fc36a3" git-subrepo: version: "0.4.9" origin: "https://github.com/ingydotnet/git-subrepo" commit: "cce3d93" --- .gitmodules | 3 - contrib/datawave-utils | 1 - contrib/datawave-utils/.gitignore | 9 + contrib/datawave-utils/.gitrepo | 12 + contrib/datawave-utils/LICENSE | 203 ++++++++++++ .../datawave-utils/assert-properties/pom.xml | 78 +++++ .../java/datawave/maven/AssertProperties.java | 214 +++++++++++++ contrib/datawave-utils/code-style/pom.xml | 44 +++ .../eclipse/Eclipse-Datawave-Codestyle.xml | 291 ++++++++++++++++++ .../eclipse/Eclipse-Datawave-Template.xml | 31 ++ .../datawave-utils/read-properties/pom.xml | 83 +++++ .../java/datawave/maven/ReadProperties.java | 172 +++++++++++ 12 files changed, 1137 insertions(+), 4 deletions(-) delete mode 160000 contrib/datawave-utils create mode 100644 contrib/datawave-utils/.gitignore create mode 100644 contrib/datawave-utils/.gitrepo create mode 100644 contrib/datawave-utils/LICENSE create mode 100644 contrib/datawave-utils/assert-properties/pom.xml create mode 100644 contrib/datawave-utils/assert-properties/src/main/java/datawave/maven/AssertProperties.java create mode 100644 contrib/datawave-utils/code-style/pom.xml create mode 100644 contrib/datawave-utils/code-style/src/main/resources/eclipse/Eclipse-Datawave-Codestyle.xml create mode 100644 contrib/datawave-utils/code-style/src/main/resources/eclipse/Eclipse-Datawave-Template.xml create mode 100644 contrib/datawave-utils/read-properties/pom.xml create mode 100644 contrib/datawave-utils/read-properties/src/main/java/datawave/maven/ReadProperties.java diff --git a/.gitmodules b/.gitmodules index 60d8ea4a4a8..70366e9ca36 100644 --- a/.gitmodules +++ b/.gitmodules @@ -58,9 +58,6 @@ [submodule "microservices/microservice-service-parent"] path = microservices/microservice-service-parent url = git@github.com:NationalSecurityAgency/datawave-service-parent.git -[submodule 
"contrib/datawave-utils"] - path = contrib/datawave-utils - url = git@github.com:NationalSecurityAgency/datawave-utils.git [submodule "microservices/starters/query"] path = microservices/starters/query url = git@github.com:NationalSecurityAgency/datawave-spring-boot-starter-query.git diff --git a/contrib/datawave-utils b/contrib/datawave-utils deleted file mode 160000 index 4348fc36a35..00000000000 --- a/contrib/datawave-utils +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4348fc36a3519ca9e5d1b96ac47c3f0b64abe34b diff --git a/contrib/datawave-utils/.gitignore b/contrib/datawave-utils/.gitignore new file mode 100644 index 00000000000..6e170178597 --- /dev/null +++ b/contrib/datawave-utils/.gitignore @@ -0,0 +1,9 @@ +target/ + +.idea/ +*.iml +*.iws + +.classpath +.project +.settings/ diff --git a/contrib/datawave-utils/.gitrepo b/contrib/datawave-utils/.gitrepo new file mode 100644 index 00000000000..6708754832e --- /dev/null +++ b/contrib/datawave-utils/.gitrepo @@ -0,0 +1,12 @@ +; DO NOT EDIT (unless you know what you are doing) +; +; This subdirectory is a git "subrepo", and this file is maintained by the +; git-subrepo command. See https://github.com/ingydotnet/git-subrepo#readme +; +[subrepo] + remote = git@github.com:NationalSecurityAgency/datawave-utils.git + branch = main + commit = 4348fc36a3519ca9e5d1b96ac47c3f0b64abe34b + parent = 1f96d1af84c45f32007da0105b8eab514ec3f7d3 + method = merge + cmdver = 0.4.9 diff --git a/contrib/datawave-utils/LICENSE b/contrib/datawave-utils/LICENSE new file mode 100644 index 00000000000..6b0b1270ff0 --- /dev/null +++ b/contrib/datawave-utils/LICENSE @@ -0,0 +1,203 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/contrib/datawave-utils/assert-properties/pom.xml b/contrib/datawave-utils/assert-properties/pom.xml new file mode 100644 index 00000000000..cc29cca46ff --- /dev/null +++ b/contrib/datawave-utils/assert-properties/pom.xml @@ -0,0 +1,78 @@ + + + 4.0.0 + + gov.nsa.datawave.plugins + assert-properties + 2.0.2-SNAPSHOT + + maven-plugin + DataWave Assert Properties Plugin + Asserts that required maven properties have been defined. 
+ https://code.nsa.gov/datawave-utils + + + The Apache License, Version 2.0 + https://www.apache.org/licenses/LICENSE-2.0.txt + + + + scm:git:https://github.com/NationalSecurityAgency/datawave-utils.git + scm:git:git@github.com:NationalSecurityAgency/datawave-utils.git + https://github.com/NationalSecurityAgency/datawave-utils + + + + github-datawave + GitHub Datawave Apache Maven Packages + https://maven.pkg.github.com/NationalSecurityAgency/datawave + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.1 + + 1.8 + 1.8 + true + UTF-8 + + + + org.apache.maven.plugins + maven-release-plugin + 2.5.3 + + @{project.artifactId}_@{project.version} + dist + -Ddist + + + + + + + + org.apache.maven + maven-core + 3.0.5 + + + org.apache.maven + maven-plugin-api + 3.0.5 + + + commons-lang + commons-lang + 2.6 + + + + diff --git a/contrib/datawave-utils/assert-properties/src/main/java/datawave/maven/AssertProperties.java b/contrib/datawave-utils/assert-properties/src/main/java/datawave/maven/AssertProperties.java new file mode 100644 index 00000000000..be266bfe140 --- /dev/null +++ b/contrib/datawave-utils/assert-properties/src/main/java/datawave/maven/AssertProperties.java @@ -0,0 +1,214 @@ +package datawave.maven; + +import org.apache.commons.lang.StringUtils; +import org.apache.maven.plugin.AbstractMojo; +import org.apache.maven.plugin.MojoExecutionException; +import org.apache.maven.plugin.MojoFailureException; +import org.apache.maven.project.MavenProject; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Properties; +import java.util.Set; + +/** + * @goal assert-properties + * @phase validate + * @threadSafe true + */ +@SuppressWarnings("unused") +public class AssertProperties extends AbstractMojo { + private static final Character 
COMMENT = '#', COMMA = ','; + + /** + * @parameter default-value="${project}" + * @required + * @readonly + */ + private MavenProject project; + + /** + * @parameter + * @required + */ + private File expectedPropertyNames; + + /** + * @parameter + */ + private File configuredPropertyNames; + + @Override + public void execute() throws MojoExecutionException, MojoFailureException { + validatePropertyNames(); + + Properties buildProps = getConfiguredProperties(); + + Map getExpectedPropertyMap = getExpectedPropertyMap(); + + // Retain only properties from the build environment that are Entry + Set propertyNames = new HashSet<>(); + for (Entry entry : buildProps.entrySet()) { + Object key = entry.getKey(), value = entry.getValue(); + + if (key instanceof String && value instanceof String) { + propertyNames.add((String)key); + } + } + + // Remove all provided properties + Set expectedProperties = getExpectedPropertyMap.keySet(); + expectedProperties.removeAll(propertyNames); + + if (expectedProperties.size() > 0) { + StringBuilder errorMessage = new StringBuilder(); + errorMessage.append(expectedProperties.size() + " properties were not provided:\n"); + for (Entry entry : getExpectedPropertyMap.entrySet()) { + errorMessage.append("Missing property: " + entry.getKey() + ", Description: " + entry.getValue()).append("\n"); + } + + throw new MojoFailureException(errorMessage.toString()); + } + } + + protected void validatePropertyNames() throws MojoExecutionException { + if (!this.expectedPropertyNames.isFile()) { + throw new MojoExecutionException("expectedPropertyNames must be a file"); + } + + if (null != this.configuredPropertyNames && !this.configuredPropertyNames.isFile()) { + throw new MojoExecutionException("configuredPropertyNames must be a file if provided"); + } + + } + + protected Properties getConfiguredProperties() throws MojoExecutionException { + Properties buildProps = project.getProperties(); + Properties envProps; + + if (null == 
this.configuredPropertyNames) { + envProps = buildProps; + } else { + FileReader propReader = null; + envProps = new Properties(); + try { + propReader = new FileReader(configuredPropertyNames); + envProps.load(propReader); + } catch (IOException e) { + throw new MojoExecutionException("Could not load configuredPropertyNames", e); + } finally { + // Make sure we don't leave any open file handles laying around + if (null != propReader) { + try { + propReader.close(); + } catch (IOException e) { + throw new MojoExecutionException("Could not load configuredPropertyNames", e); + } + } + } + } + + for (Entry entry : buildProps.entrySet()) { + envProps.put(entry.getKey(), entry.getValue()); + } + + return envProps; + } + + /** + * Fetch the set of strings from the configured filename + * @return + * @throws MojoExecutionException + * @throws IOException + */ + protected Map getExpectedPropertyMap() throws MojoExecutionException { + BufferedReader reader ; + try { + reader = new BufferedReader(new FileReader(this.expectedPropertyNames)); + } catch (FileNotFoundException e) { + getLog().warn("Could not read exepcted properties files"); + + throw new MojoExecutionException("Could not read expected properties file", e); + } + + HashMap expectedProperties = new HashMap<>(); + + String line = null; + try { + while ((line = reader.readLine()) != null) { + // Remove leading/trailing whitespace + line = line.trim(); + + // Ignore empty lines or those starting with a '#' + if (StringUtils.isBlank(line) || line.charAt(0) == COMMENT) { + continue; + } + + // Strip everything after a comma if it exists + int index = line.indexOf(COMMA); + String candidateName, candidateDescription; + + // Trim again to make sure we catch any "new" trailing whitespace + // after the property name but before where the comma was + if (index == -1) { + candidateName = line.trim(); + candidateDescription = ""; + } else { + candidateName = line.substring(0, index).trim(); + candidateDescription = 
line.substring(index + 1).trim(); + } + + // Add it to the expected set i the line still isn't blank + if (StringUtils.isNotBlank(candidateName)) { + expectedProperties.put(candidateName, candidateDescription); + } + } + } catch (IOException e) { + throw new MojoExecutionException("Could not read expected properties file", e); + } finally { + // Make sure we don't leave any open file handles laying around + try { + reader.close(); + } catch (IOException e) { + throw new MojoExecutionException("Could not close reader to expected properties", e); + } + } + + if (expectedProperties.isEmpty()) { + getLog().warn("No expected properties were loaded from " + this.expectedPropertyNames); + } + + return expectedProperties; + } + + public MavenProject getProject() { + return project; + } + + public void setProject(MavenProject project) { + this.project = project; + } + + public File getExpectedPropertyNames() { + return expectedPropertyNames; + } + + public void setExpectedPropertyNames(File expectedPropertyNames) { + this.expectedPropertyNames = expectedPropertyNames; + } + + public File getConfiguredPropertyNames() { + return configuredPropertyNames; + } + + public void setConfiguredPropertyNames(File configuredPropertyNames) { + this.configuredPropertyNames = configuredPropertyNames; + } +} diff --git a/contrib/datawave-utils/code-style/pom.xml b/contrib/datawave-utils/code-style/pom.xml new file mode 100644 index 00000000000..9febbb7c6ef --- /dev/null +++ b/contrib/datawave-utils/code-style/pom.xml @@ -0,0 +1,44 @@ + + + 4.0.0 + gov.nsa.datawave + datawave-code-style + 1.1-SNAPSHOT + jar + DataWave Code Formatter + This pom configures the formatter-maven-plugin to format the code + according to the DataWave code style. 
+ https://code.nsa.gov/datawave-urils + + + The Apache License, Version 2.0 + https://www.apache.org/licenses/LICENSE-2.0.txt + + + + scm:git:https://github.com/NationalSecurityAgency/datawave-utils.git + scm:git:git@github.com:NationalSecurityAgency/datawave-utils.git + https://github.com/NationalSecurityAgency/datawave-utils + + + + github-datawave + GitHub Datawave Apache Maven Packages + https://maven.pkg.github.com/NationalSecurityAgency/datawave + + + + + + org.apache.maven.plugins + maven-release-plugin + 2.5.3 + + @{project.artifactId}_@{project.version} + dist + -Ddist + + + + + diff --git a/contrib/datawave-utils/code-style/src/main/resources/eclipse/Eclipse-Datawave-Codestyle.xml b/contrib/datawave-utils/code-style/src/main/resources/eclipse/Eclipse-Datawave-Codestyle.xml new file mode 100644 index 00000000000..71f031467b1 --- /dev/null +++ b/contrib/datawave-utils/code-style/src/main/resources/eclipse/Eclipse-Datawave-Codestyle.xml @@ -0,0 +1,291 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/contrib/datawave-utils/code-style/src/main/resources/eclipse/Eclipse-Datawave-Template.xml b/contrib/datawave-utils/code-style/src/main/resources/eclipse/Eclipse-Datawave-Template.xml new file mode 100644 index 00000000000..ae67af04738 --- /dev/null +++ b/contrib/datawave-utils/code-style/src/main/resources/eclipse/Eclipse-Datawave-Template.xml @@ -0,0 +1,31 @@ + + +