DMS_DESIGNATORS = Arrays.asList('n', 's', 'e', 'w');
+
+ /**
+  * Parses a latitude or longitude given either as decimal degrees or in degrees/minutes/seconds (DMS) form.
+  * <p>
+  * The value is trimmed and, if {@link NumericalEncoder} reports it as possibly encoded, decoded (a failed decode is ignored and the text is used as given).
+  * It is then dispatched on its last character: a trailing n, s, e or w (either case) selects {@link #convertDMStoDD(String)}; anything else is handed to
+  * {@link #parseDouble(String)}.
+  *
+  * @param value
+  *            the latitude or longitude text
+  * @return the value in decimal degrees
+  * @throws ParseException
+  *             if the value is empty after trimming or cannot be parsed
+  */
+ public static double parseLatOrLon(String value) throws ParseException {
+     value = value.trim();
+
+     // Reject an empty value up front so the caller gets a ParseException instead of the
+     // IndexOutOfBoundsException the charAt call below would raise.
+     if (value.isEmpty()) {
+         throw new ParseException("Could not normalize empty value as latitude or longitude");
+     }
+
+     // The value may already be in encoded form; decoding is best-effort.
+     try {
+         if (NumericalEncoder.isPossiblyEncoded(value)) {
+             value = NumericalEncoder.decode(value).toPlainString();
+         }
+     } catch (Exception ignored) {
+         // It only looked encoded: keep the text exactly as given.
+     }
+
+     char end = Character.toLowerCase(value.charAt(value.length() - 1));
+     if (DMS_DESIGNATORS.contains(end)) {
+         try {
+             return convertDMStoDD(value);
+         } catch (NormalizationException ne) {
+             throw new ParseException("Unable to convert DMS to DD format", ne);
+         }
+     }
+
+     try {
+         return parseDouble(value);
+     } catch (Exception nfe) {
+         throw new ParseException("Unable to parse lat or lon " + value, nfe);
+     }
+ }
+
+ /**
+ * Convert a Degrees / Minutes / Seconds latitude or longitude into decimal degrees.
+ *
+ * @param val
+ * @return
+ */
+ public static double convertDMStoDD(String val) throws NormalizationException {
+ try {
+ boolean negate = false;
+ double degrees = 0.0d;
+ double minutes = 0.0d;
+ double seconds = 0.0d;
+
+ val = val.trim();
+ char end = Character.toLowerCase(val.charAt(val.length() - 1));
+ if (end == 'n' || end == 'e') {
+ val = val.substring(0, val.length() - 1).trim();
+ } else if (end == 's' || end == 'w') {
+ val = val.substring(0, val.length() - 1).trim();
+ negate = true;
+ }
+
+ // see if it is already split up
+ if (val.indexOf(':') >= 0) {
+ String[] parts = Iterables.toArray(Splitter.on(':').split(val), String.class);
+ degrees = Double.parseDouble(parts[0]);
+ if (parts.length > 1) {
+ minutes = Double.parseDouble(parts[1]);
+ if (parts.length > 2) {
+ seconds = Double.parseDouble(parts[2]);
+ if (parts.length > 3) {
+ throw new NormalizationException("Do not know how to convert lat or lon value: " + val);
+ }
+ }
+ }
+ } else {
+ int point = val.indexOf('.');
+ if (point < 0)
+ point = val.length();
+ // if more than 3 digits, then we have minutes
+ if (point > 3) {
+ // if more than 5 digits, then we have seconds
+ if (point > 5) {
+ seconds = Double.parseDouble(val.substring(point - 2));
+ minutes = Double.parseDouble(val.substring(point - 4, point - 2));
+ degrees = Double.parseDouble(val.substring(0, point - 4));
+ } else {
+ minutes = Double.parseDouble(val.substring(point - 2));
+ degrees = Double.parseDouble(val.substring(0, point - 2));
+ }
+ } else {
+ degrees = Double.parseDouble(val);
+ }
+ }
+
+ double dd = degrees + (minutes / 60.0d) + (seconds / 3600.0d);
+ if (negate) {
+ dd = (0.0d - dd);
+ }
+
+ return dd;
+ } catch (Exception nfe) {
+ throw new NormalizationException("Failed to convert numeric value part of a lat or lon " + val, nfe);
+ }
+ }
+
+ /**
+  * Parses a double that is written either as plain text accepted by {@link Double#parseDouble(String)} or in the form handled by
+  * {@link NumericalEncoder}.
+  *
+  * @param val
+  *            the text to parse
+  * @return the parsed value
+  * @throws ParseException
+  *             if the text is neither a plain double nor a decodable encoded number; the underlying failure is attached as the cause
+  */
+ public static double parseDouble(String val) throws ParseException {
+     try {
+         return Double.parseDouble(val);
+     } catch (Exception e) {
+         if (!NumericalEncoder.isPossiblyEncoded(val)) {
+             throw new ParseException("Unknown double format: " + val, e);
+         }
+         try {
+             return NumericalEncoder.decode(val).doubleValue();
+         } catch (Exception e2) {
+             // Don't log, since it's expected that we'll sometimes use this normalizer and pass bad values
+             // when we need to run an unknown type of term through all normalizers.
+             throw new ParseException("Failed to convert " + val + " into a double value", e2);
+         }
+     }
+ }
+
+ public String combineLatLon(double lat, double lon) throws OutOfRangeException {
+ return GeoPoint.getZRefStr(new GeoPoint(lat, lon));
+ }
+
+ /**
+ * Locates the position at which a combined latitude/longitude string should be split.
+ * <p>
+ * If {@code separator} is non-null and occurs in the string at an index greater than zero, that index is returned. Otherwise the string is scanned outward
+ * from its midpoint (midpoint, one to the right, one to the left, two to the right, ...) and the index of the first character that is not a digit, '.',
+ * '-', '+', or one of the letters n, s, e, w (either case) is returned.
+ * <p>
+ * NOTE(review): the scan stops as soon as the right-hand probe runs past the end, so for an even-length string index 0 is never examined.
+ *
+ * @param s
+ *            the combined latitude/longitude string
+ * @return the index of the split character, or -1 if no such character is found
+ */
+ public int findSplit(String s) {
+ if (separator != null) {
+ int i = s.indexOf(separator);
+ if (i > 0) {
+ return i;
+ }
+ }
+ // no usable separator: probe outward from the center for a character that cannot be part of a lat or lon
+ for (int i = 0; i < s.length(); ++i) {
+ int side = (i % 2 == 0 ? -1 : 1);
+ int dist = (i + 1) / 2;
+ int index = (s.length() / 2) + (dist * side);
+ if (index >= s.length())
+ break;
+ char c = s.charAt(index);
+ if ((c > '9' || c < '0') && (c != '.' && c != '-' && c != '+') && (c != 'n' && c != 'N' && c != 's' && c != 'S')
+ && (c != 'e' && c != 'E' && c != 'w' && c != 'W')) {
+ return index;
+ }
+ }
+ return -1;
+ }
+
+ /**
+  * A latitude / longitude pair in decimal degrees that can be converted to and from an interlaced "z-reference" representation.
+  */
+ public static class GeoPoint {
+     private final double latitude;
+     private final double longitude;
+
+     /**
+      * Creates a GeoPoint from numeric coordinates.
+      *
+      * @param latitude
+      *            latitude in decimal degrees, within [-90, 90]
+      * @param longitude
+      *            longitude in decimal degrees, within [-180, 180]
+      * @throws OutOfRangeException
+      *             if either coordinate is outside its range or is NaN
+      */
+     public GeoPoint(double latitude, double longitude) throws OutOfRangeException {
+         this.latitude = latitude;
+         this.longitude = longitude;
+         validate();
+     }
+
+     /**
+      * Creates a GeoPoint from textual coordinates, each parsed with {@link GeoNormalizer#parseDouble(String)}.
+      *
+      * @param latitude
+      *            latitude text
+      * @param longitude
+      *            longitude text
+      * @throws OutOfRangeException
+      *             if either coordinate is outside its range or is NaN
+      * @throws ParseException
+      *             if either coordinate cannot be parsed
+      */
+     public GeoPoint(String latitude, String longitude) throws OutOfRangeException, ParseException {
+         this.latitude = GeoNormalizer.parseDouble(latitude);
+         this.longitude = GeoNormalizer.parseDouble(longitude);
+         validate();
+     }
+
+     /**
+      * Checks that the latitude is within [-90, 90] and the longitude within [-180, 180].
+      *
+      * @throws OutOfRangeException
+      *             if a coordinate is out of range or is NaN
+      */
+     public void validate() throws OutOfRangeException {
+         // Written as negated "in range" tests so that NaN, for which every comparison is false, is rejected too.
+         if (!(this.latitude >= -90.0 && this.latitude <= 90.0)) {
+             throw new OutOfRangeException("Latitude is outside of valid range [-90, 90]: " + this.latitude + ", " + this.longitude);
+         }
+         if (!(this.longitude >= -180.0 && this.longitude <= 180.0)) {
+             throw new OutOfRangeException("Longitude is outside of valid range [-180, 180]: " + this.latitude + ", " + this.longitude);
+         }
+     }
+
+     /**
+      * Creates the formatter used for both shifted coordinates: exactly three integer digits and five fraction digits.
+      * <p>
+      * The locale is pinned to {@link java.util.Locale#ROOT} so the output always uses ASCII digits and a '.' separator regardless of the JVM default
+      * locale; otherwise the z-reference would differ between hosts and could not be parsed back by {@link #decodeZRef(String)}. A new instance is
+      * created per call because {@link NumberFormat} is not thread-safe.
+      */
+     private static NumberFormat newZRefFormat() {
+         NumberFormat formatter = NumberFormat.getInstance(java.util.Locale.ROOT);
+         formatter.setMaximumIntegerDigits(3);
+         formatter.setMinimumIntegerDigits(3);
+         formatter.setMaximumFractionDigits(5);
+         formatter.setMinimumFractionDigits(5);
+         return formatter;
+     }
+
+     /**
+      * Returns the interlaced representation of the point as a {@link Text}; the content is the same as {@link #getZRefStr(GeoPoint)}.
+      *
+      * @param p
+      *            the point to encode
+      * @return the z-reference as Text
+      */
+     public static Text getZRef(GeoPoint p) {
+         // The formatted output is pure ASCII, so one byte per character reproduces the string exactly.
+         return new Text(getZRefStr(p).getBytes(java.nio.charset.StandardCharsets.US_ASCII));
+     }
+
+     /**
+      * Returns an interlaced representation of the latitude and longitude. The latitude's normal range of -90:90 is shifted to 0:180 (+90) and the
+      * longitude's normal range of -180:180 is shifted to 0:360 (+180). Both are formatted as {@code DDD.ddddd} and their characters are then alternated,
+      * latitude first.
+      *
+      * For example:
+      *
+      * {@code [45, -150] => [135, 30] => 103350..0000000000}
+      *
+      * @param p
+      *            the point to encode
+      * @return the z-reference string
+      */
+     public static String getZRefStr(GeoPoint p) {
+         NumberFormat formatter = newZRefFormat();
+         String latS = formatter.format(p.latitude + 90.0);
+         String lonS = formatter.format(p.longitude + 180.0);
+
+         StringBuilder sb = new StringBuilder(latS.length() * 2);
+         for (int i = 0; i < latS.length(); ++i) {
+             sb.append(latS.charAt(i));
+             sb.append(lonS.charAt(i));
+         }
+         return sb.toString();
+     }
+
+     /**
+      * Factory method for decoding a z-reference from a Text object. Only the first {@code zref.getLength()} bytes of {@code zref.getBytes()} are read, and
+      * each byte is treated as one character.
+      *
+      * @param zref
+      *            the z-reference to decode
+      * @return the decoded point
+      * @throws OutOfRangeException
+      *             if the decoded coordinates are out of range
+      * @throws ParseException
+      *             if either de-interlaced half is not a number
+      */
+     public static GeoPoint decodeZRef(Text zref) throws OutOfRangeException, ParseException {
+         byte[] bytes = zref.getBytes();
+         int length = zref.getLength();
+         StringBuilder chars = new StringBuilder(length);
+         for (int i = 0; i < length; i++) {
+             chars.append((char) bytes[i]);
+         }
+         return decodeZRef(chars.toString());
+     }
+
+     /**
+      * Factory method for decoding a z-reference from a String. Characters at even positions form the shifted latitude and characters at odd positions
+      * form the shifted longitude; the shifts (+90 / +180) are then removed.
+      *
+      * @param zref
+      *            the z-reference to decode
+      * @return the decoded point
+      * @throws OutOfRangeException
+      *             if the decoded coordinates are out of range
+      * @throws ParseException
+      *             if either de-interlaced half is not a number
+      */
+     public static GeoPoint decodeZRef(String zref) throws OutOfRangeException, ParseException {
+         StringBuilder latB = new StringBuilder();
+         StringBuilder lonB = new StringBuilder();
+         for (int i = 0; i < zref.length(); i++) {
+             StringBuilder target = (i % 2 == 0) ? latB : lonB;
+             target.append(zref.charAt(i));
+         }
+
+         double lat = GeoNormalizer.parseDouble(latB.toString());
+         double lon = GeoNormalizer.parseDouble(lonB.toString());
+
+         return new GeoPoint(lat - 90.0, lon - 180.0);
+     }
+
+     /**
+      * Given a bounding box described by the lower left corner (boundMin) and the upper right corner (boundMax), this method tests whether or not the
+      * point is within that box. Points on the edge count as within.
+      *
+      * @param boundMin
+      *            lower left corner
+      * @param boundMax
+      *            upper right corner
+      * @return true if this point lies inside or on the edge of the box
+      */
+     public boolean within(GeoPoint boundMin, GeoPoint boundMax) {
+         return getLatitude() >= boundMin.getLatitude() && getLatitude() <= boundMax.getLatitude() && getLongitude() >= boundMin.getLongitude()
+                         && getLongitude() <= boundMax.getLongitude();
+     }
+
+     public double getLatitude() {
+         return latitude;
+     }
+
+     public double getLongitude() {
+         return longitude;
+     }
+
+     /**
+      * Formats the point as {@code (longitude, latitude)}; note the longitude comes first.
+      */
+     @Override
+     public String toString() {
+         return "(" + getLongitude() + ", " + getLatitude() + ")";
+     }
+
+     @Override
+     public boolean equals(Object o) {
+         if (!(o instanceof GeoPoint)) {
+             // also covers null; a non-GeoPoint can never be this instance
+             return false;
+         }
+         GeoPoint ogp = (GeoPoint) o;
+         return latitude == ogp.latitude && longitude == ogp.longitude;
+     }
+
+     @Override
+     public int hashCode() {
+         // equals uses ==, under which 0.0 and -0.0 are equal although they hash differently as Doubles.
+         // Adding 0.0 maps -0.0 to 0.0 and leaves every other value untouched, keeping hashCode consistent with equals.
+         return Objects.hash(getLatitude() + 0.0d, getLongitude() + 0.0d);
+     }
+ }
+
+ /**
+  * Checked exception signalling that a latitude or longitude lies outside its valid range.
+  */
+ public static class OutOfRangeException extends Exception {
+
+     /** Creates the exception with neither message nor cause. */
+     public OutOfRangeException() {
+         super();
+     }
+
+     /** Creates the exception with a message only. */
+     public OutOfRangeException(String message) {
+         super(message);
+     }
+
+     /** Creates the exception with a cause only. */
+     public OutOfRangeException(Throwable cause) {
+         super(cause);
+     }
+
+     /** Creates the exception with both a message and a cause. */
+     public OutOfRangeException(String message, Throwable cause) {
+         super(message, cause);
+     }
+
+ }
+
+ /**
+  * Checked exception signalling that a latitude, longitude or numeric value could not be parsed.
+  */
+ public static class ParseException extends Exception {
+
+     /** Creates the exception with neither message nor cause. */
+     public ParseException() {
+         super();
+     }
+
+     /** Creates the exception with a message only. */
+     public ParseException(String message) {
+         super(message);
+     }
+
+     /** Creates the exception with a cause only. */
+     public ParseException(Throwable cause) {
+         super(cause);
+     }
+
+     /** Creates the exception with both a message and a cause. */
+     public ParseException(String message, Throwable cause) {
+         super(message, cause);
+     }
+
+ }
+
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/GeometryNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/GeometryNormalizer.java
new file mode 100644
index 00000000000..1200261de97
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/GeometryNormalizer.java
@@ -0,0 +1,88 @@
+package datawave.data.normalizer;
+
+import java.util.List;
+
+import org.locationtech.geowave.core.geotime.index.dimension.LatitudeDefinition;
+import org.locationtech.geowave.core.geotime.index.dimension.LongitudeDefinition;
+import org.locationtech.geowave.core.index.NumericIndexStrategy;
+import org.locationtech.geowave.core.index.dimension.NumericDimensionDefinition;
+import org.locationtech.geowave.core.index.sfc.SFCFactory;
+import org.locationtech.geowave.core.index.sfc.tiered.TieredSFCIndexFactory;
+import org.locationtech.geowave.core.store.api.Index;
+import org.locationtech.geowave.core.store.index.CustomNameIndex;
+
+import com.google.common.collect.Lists;
+
+import datawave.data.type.util.Geometry;
+
+/**
+ * A normalizer that, given a parseable geometry string representing an arbitrary geometry, will perform GeoWave indexing with a multi-tiered spatial GeoWave
+ * index configuration.
+ * <p>
+ * The index strategy is a full incremental tiered strategy over a longitude and a latitude dimension using a Hilbert space-filling curve. The strategy and
+ * the index wrapping it are each held in a {@link ThreadLocal}, so every thread lazily creates and then reuses its own instances.
+ * <p>
+ * A single geometry normalizes to many terms: one encoded string per index value produced for it.
+ */
+public class GeometryNormalizer extends AbstractGeometryNormalizer implements OneToManyNormalizer {
+ private static final long serialVersionUID = 171360806347433135L;
+
+ // NOTE: If we change the index strategy, then we will need to update the validHash method appropriately.
+ // @formatter:off
+ public static final ThreadLocal indexStrategy = ThreadLocal.withInitial(GeometryNormalizer::createIndexStrategy);
+ // @formatter:on
+
+ public static final ThreadLocal index = ThreadLocal.withInitial(() -> new CustomNameIndex(indexStrategy.get(), null, "geometryIndex")); // per-thread index named "geometryIndex" built on this thread's strategy
+
+ protected static NumericIndexStrategy createIndexStrategy() {
+ // @formatter:off
+ return TieredSFCIndexFactory.createFullIncrementalTieredStrategy(
+ new NumericDimensionDefinition[]{
+ new LongitudeDefinition(),
+ new LatitudeDefinition(
+ true)
+ // the 'true' argument above: just use the same range for latitude to make square sfc values in
+ // decimal degrees (EPSG:4326)
+ },
+ new int[]{
+ LONGITUDE_BITS,
+ LATITUDE_BITS
+ },
+ SFCFactory.SFCType.HILBERT);
+ // @formatter:on
+ }
+
+ public NumericIndexStrategy getIndexStrategy() {
+ // NOTE: If we change the index strategy, then we will need to update the validHash method appropriately.
+ return GeometryNormalizer.indexStrategy.get();
+ }
+
+ public static NumericIndexStrategy getGeometryIndexStrategy() {
+ return GeometryNormalizer.indexStrategy.get();
+ }
+
+ public Index getIndex() {
+ return index.get();
+ }
+
+ public static Index getGeometryIndex() {
+ return index.get();
+ }
+
+ @Override
+ public List normalizeToMany(String geoString) throws IllegalArgumentException {
+ if (validHash(geoString)) { // input already passes validHash: return it unchanged as the only term
+ return Lists.newArrayList(geoString);
+ }
+ return normalizeDelegateTypeToMany(createDatawaveGeometry(parseGeometry(geoString))); // otherwise parse, wrap and index the geometry
+ }
+
+ @Override
+ public List normalizeDelegateTypeToMany(Geometry geometry) {
+ List list = Lists.newArrayList();
+ for (byte[] one : getIndicesFromGeometry(geometry)) { // one encoded term per index value
+ list.add(getEncodedStringFromIndexBytes(one));
+ }
+ return list;
+ }
+
+ protected datawave.data.type.util.Geometry createDatawaveGeometry(org.locationtech.jts.geom.Geometry geometry) {
+ return new datawave.data.type.util.Geometry(geometry);
+ }
+
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/HexStringNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/HexStringNormalizer.java
new file mode 100644
index 00000000000..762f09a5549
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/HexStringNormalizer.java
@@ -0,0 +1,74 @@
+package datawave.data.normalizer;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class HexStringNormalizer extends AbstractNormalizer {
+
+ private static final long serialVersionUID = -2056362158103923525L;
+ private static final Logger log = LoggerFactory.getLogger(HexStringNormalizer.class);
+ private final Pattern pattern;
+
+ public HexStringNormalizer() {
+ this("(0x)?([0-9a-fA-F]+)");
+ }
+
+ protected HexStringNormalizer(String regex) {
+ pattern = Pattern.compile(regex);
+ }
+
+ protected String getNormalizedHex(String hex) {
+ if (hex.length() % 2 == 0) {
+ return LC_NO_DIACRITICS_NORMALIZER.normalize(hex);
+ }
+
+ StringBuilder buf = new StringBuilder(hex.length() + 1);
+ return LC_NO_DIACRITICS_NORMALIZER.normalize(buf.append("0").append(hex).toString());
+ }
+
+ protected Matcher validate(String fieldValue) {
+ if (StringUtils.isEmpty(fieldValue)) {
+ logAndThrow("Field may not be null or empty.");
+ }
+
+ Matcher matcher = pattern.matcher(fieldValue);
+ if (!matcher.matches()) {
+ logAndThrow(String.format("Failed to normalize hex value : %s.", fieldValue));
+ }
+
+ return matcher;
+ }
+
+ @Override
+ public String normalize(String fieldValue) {
+ Matcher matcher = validate(fieldValue);
+
+ return getNormalizedHex(matcher.group(2));
+ }
+
+ private void logAndThrow(String msg) {
+ if (log.isDebugEnabled()) {
+ log.debug(msg);
+ }
+ throw new IllegalArgumentException(msg);
+ }
+
+ @Override
+ public String normalizeRegex(String fieldRegex) {
+ return normalize(fieldRegex);
+ }
+
+ @Override
+ public String normalizeDelegateType(String delegateIn) {
+ return normalize(delegateIn);
+ }
+
+ @Override
+ public String denormalize(String in) {
+ return in;
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/IpAddressNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/IpAddressNormalizer.java
new file mode 100644
index 00000000000..885632dfbe0
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/IpAddressNormalizer.java
@@ -0,0 +1,56 @@
+package datawave.data.normalizer;
+
+import org.apache.commons.net.util.SubnetUtils;
+
+import datawave.data.type.util.IpAddress;
+import datawave.query.parser.JavaRegexAnalyzer;
+import datawave.query.parser.JavaRegexAnalyzer.JavaRegexParseException;
+
+/**
+ * Normalizes IP addresses to the zero-padded string form produced by {@link IpAddress#toZeroPaddedString()}.
+ */
+public class IpAddressNormalizer extends AbstractNormalizer {
+
+    private static final long serialVersionUID = 8604032745289485764L;
+
+    /**
+     * Removes all space characters from the value, parses it as an IP address and returns its zero-padded form.
+     *
+     * @param fieldValue
+     *            the address text
+     * @return the zero-padded address
+     * @throws IpAddressNormalizer.Exception
+     *             if the value cannot be parsed; the parser's exception is attached as the cause
+     */
+    @Override
+    public String normalize(String fieldValue) {
+        String compact = fieldValue.replace(" ", "");
+        try {
+            return IpAddress.parse(compact).toZeroPaddedString();
+        } catch (IllegalArgumentException iae) {
+            throw new IpAddressNormalizer.Exception("Failed to normalize " + compact + " as an IP", iae);
+        }
+    }
+
+    /**
+     * Rewrites the regex with {@link JavaRegexAnalyzer#getZeroPadIpRegex()}.
+     * <p>
+     * NOTE(review): presumably the result matches the zero-padded form written by {@link #normalize(String)}; confirm against JavaRegexAnalyzer.
+     *
+     * @throws IllegalArgumentException
+     *             if the regex cannot be parsed
+     */
+    @Override
+    public String normalizeRegex(String fieldRegex) {
+        try {
+            return new JavaRegexAnalyzer(fieldRegex).getZeroPadIpRegex();
+        } catch (JavaRegexParseException jrpe) {
+            throw new IllegalArgumentException("Failed to parse ip regex " + fieldRegex, jrpe);
+        }
+    }
+
+    /**
+     * Converts a CIDR block into the normalized low and high addresses reported by {@link SubnetUtils}, with the inclusive host count enabled.
+     *
+     * @param cidr
+     *            the block in CIDR notation
+     * @return a two element array: normalized low address, normalized high address
+     */
+    public String[] normalizeCidrToRange(String cidr) {
+        SubnetUtils subnetUtils = new SubnetUtils(cidr);
+        subnetUtils.setInclusiveHostCount(true);
+        SubnetUtils.SubnetInfo info = subnetUtils.getInfo();
+        return new String[] {normalize(info.getLowAddress()), normalize(info.getHighAddress())};
+    }
+
+    @Override
+    public String normalizeDelegateType(IpAddress delegateIn) {
+        return delegateIn.toZeroPaddedString();
+    }
+
+    @Override
+    public IpAddress denormalize(String in) {
+        return IpAddress.parse(in);
+    }
+
+    /**
+     * Thrown when a value cannot be normalized as an IP address. Within this class the simple name {@code Exception} refers to this type, not to
+     * {@link java.lang.Exception}.
+     */
+    public static class Exception extends IllegalArgumentException {
+        public Exception(String message) {
+            super(message);
+        }
+
+        public Exception(String message, Throwable cause) {
+            super(message, cause);
+        }
+    }
+
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/LcNoDiacriticsNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/LcNoDiacriticsNormalizer.java
new file mode 100644
index 00000000000..2dde04c4b98
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/LcNoDiacriticsNormalizer.java
@@ -0,0 +1,74 @@
+package datawave.data.normalizer;
+
+import java.text.Normalizer;
+import java.text.Normalizer.Form;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import datawave.query.parser.JavaRegexAnalyzer;
+import datawave.query.parser.JavaRegexAnalyzer.JavaRegexParseException;
+
+/**
+ * A Normalizer which performs the following steps, in order:
+ *
+ * - Unicode canonical decomposition ({@link Form#NFD})
+ * - Removal of combining diacritical marks
+ * - Unicode canonical composition ({@link Form#NFC})
+ * - lower casing in the {@link Locale#ENGLISH English locale}
+ *
+ */
+public class LcNoDiacriticsNormalizer extends AbstractNormalizer {
+    private static final long serialVersionUID = -7922074256473963293L;
+    private static final Pattern diacriticals = Pattern.compile("\\p{InCombiningDiacriticalMarks}");
+
+    /**
+     * Strips diacritics from the value and lower-cases it.
+     *
+     * @param fieldValue
+     *            the value to normalize; may be null
+     * @return the normalized value, or null if the input was null
+     */
+    @Override
+    public String normalize(String fieldValue) {
+        return fieldValue == null ? null : stripDiacritics(fieldValue).toLowerCase(Locale.ENGLISH);
+    }
+
+    /**
+     * Runs the first three steps of the normalization: decompose, drop the combining marks, recompose.
+     */
+    private String stripDiacritics(String text) {
+        String decomposed = Normalizer.normalize(text, Form.NFD);
+        return Normalizer.normalize(removeDiacriticalMarks(decomposed), Form.NFC);
+    }
+
+    private String removeDiacriticalMarks(String str) {
+        return diacriticals.matcher(str).replaceAll("");
+    }
+
+    /**
+     * Strips diacritics from the regex and then rewrites it via {@link JavaRegexAnalyzer} with case sensitivity turned off.
+     *
+     * @param fieldRegex
+     *            the regex to normalize; may be null
+     * @return the rewritten regex, or null if the input was null
+     * @throws IllegalArgumentException
+     *             if the regex cannot be parsed
+     */
+    @Override
+    public String normalizeRegex(String fieldRegex) {
+        if (fieldRegex == null) {
+            return null;
+        }
+        String withoutDiacritics = stripDiacritics(fieldRegex);
+        try {
+            JavaRegexAnalyzer analyzer = new JavaRegexAnalyzer(withoutDiacritics);
+            analyzer.applyRegexCaseSensitivity(false);
+            return analyzer.getRegex();
+        } catch (JavaRegexParseException e) {
+            throw new IllegalArgumentException("Unable to parse regex " + fieldRegex, e);
+        }
+    }
+
+    @Override
+    public boolean normalizedRegexIsLossy(String regex) {
+        // This normalizer is in fact lossy. We still report false because users
+        // are accustomed to overmatching when their pattern contains diacritics
+        // or upper case letters. This may be revisited later.
+        return false;
+    }
+
+    @Override
+    public String normalizeDelegateType(String delegateIn) {
+        return normalize(delegateIn);
+    }
+
+    /**
+     * Returns the input unchanged.
+     */
+    @Override
+    public String denormalize(String in) {
+        return in;
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/LcNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/LcNormalizer.java
new file mode 100644
index 00000000000..ab678ec0aa4
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/LcNormalizer.java
@@ -0,0 +1,50 @@
+package datawave.data.normalizer;
+
+import java.util.Locale;
+
+import datawave.query.parser.JavaRegexAnalyzer;
+import datawave.query.parser.JavaRegexAnalyzer.JavaRegexParseException;
+
+/**
+ * A Normalizer that lower-cases values in the {@link Locale#ENGLISH English locale}. Unlike {@link LcNoDiacriticsNormalizer} it leaves diacritics in place.
+ */
+public class LcNormalizer extends AbstractNormalizer {
+
+    private static final long serialVersionUID = 8311875506912885780L;
+
+    /**
+     * Lower-cases the value.
+     *
+     * @param fieldValue
+     *            the value to normalize; may be null
+     * @return the lower-cased value, or null if the input was null (matching {@link #normalizeRegex(String)})
+     */
+    @Override
+    public String normalize(String fieldValue) {
+        if (null == fieldValue) {
+            return null;
+        }
+        return fieldValue.toLowerCase(Locale.ENGLISH);
+    }
+
+    /**
+     * Rewrites the regex via {@link JavaRegexAnalyzer} with case sensitivity turned off.
+     *
+     * @param fieldRegex
+     *            the regex to normalize; may be null
+     * @return the rewritten regex, or null if the input was null
+     * @throws IllegalArgumentException
+     *             if the regex cannot be parsed
+     */
+    @Override
+    public String normalizeRegex(String fieldRegex) {
+        if (null == fieldRegex) {
+            return null;
+        }
+        try {
+            JavaRegexAnalyzer regex = new JavaRegexAnalyzer(fieldRegex);
+            regex.applyRegexCaseSensitivity(false);
+            return regex.getRegex();
+        } catch (JavaRegexParseException e) {
+            throw new IllegalArgumentException("Unable to parse regex " + fieldRegex, e);
+        }
+    }
+
+    @Override
+    public boolean normalizedRegexIsLossy(String regex) {
+        // Despite this normalizer actually being lossy, we are still
+        // returning false as users are used to overmatching when including
+        // upper case letters. We may consider changing this
+        // down the road, but for now returning false.
+        return false;
+    }
+
+    @Override
+    public String normalizeDelegateType(String delegateIn) {
+        return normalize(delegateIn);
+    }
+
+    /**
+     * Returns the input unchanged.
+     */
+    @Override
+    public String denormalize(String in) {
+        return in;
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/MacAddressNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/MacAddressNormalizer.java
new file mode 100644
index 00000000000..70631098f0e
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/MacAddressNormalizer.java
@@ -0,0 +1,116 @@
+package datawave.data.normalizer;
+
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterables;
+
+/**
+ * Normalizes MAC addresses to six dash separated groups, e.g. {@code 11-01-00-11-11-11}.
+ */
+public class MacAddressNormalizer extends AbstractNormalizer {
+
+    private static final long serialVersionUID = -2606365671421121859L;
+
+    /**
+     * Normalizes a MAC address given in one of three forms:
+     * <ul>
+     * <li>six groups separated by ':' or</li>
+     * <li>six groups separated by '-': in both cases one-character groups are left-padded with {@code 0} and the groups are joined with '-'; the groups are
+     * otherwise not validated and their case is kept;</li>
+     * <li>a bare hexadecimal number that fits in 48 bits, which is written as six lower-case, zero-padded groups.</li>
+     * </ul>
+     *
+     * @param fieldValue
+     *            the MAC address text
+     * @return the dash separated form
+     * @throws IllegalArgumentException
+     *             if the value is in none of the accepted forms
+     */
+    @Override
+    public String normalize(String fieldValue) {
+        String[] parts = Iterables.toArray(Splitter.on(':').split(fieldValue), String.class);
+        if (parts.length == 6) {
+            // e.g. 11:1:0:11:11:11 becomes 11-01-00-11-11-11
+            return StringUtils.join(padWithZeros(parts), "-");
+        }
+
+        parts = Iterables.toArray(Splitter.on('-').split(fieldValue), String.class);
+        if (parts.length == 6) {
+            // e.g. 11-1-0-11-11-11 becomes 11-01-00-11-11-11
+            return StringUtils.join(padWithZeros(parts), "-");
+        }
+
+        // Not delimited: treat the value as a raw hex number holding the 6 bytes of a MAC address.
+        long lData;
+        try {
+            lData = Long.parseLong(fieldValue, 16);
+        } catch (NumberFormatException e) {
+            throw new IllegalArgumentException("Failed to normalize " + fieldValue + " as a MAC", e);
+        }
+        if (!isMac(lData)) {
+            throw new IllegalArgumentException("Failed to normalize " + fieldValue + " as a MAC");
+        }
+
+        // Emit the six bytes from most to least significant.
+        StringBuilder mac = new StringBuilder(17);
+        for (int shift = 40; shift >= 0; shift -= 8) {
+            long octet = (lData >> shift) & 0xFFL;
+            if (mac.length() > 0) {
+                mac.append('-');
+            }
+            if (octet < 0x10) {
+                mac.append('0');
+            }
+            mac.append(Long.toHexString(octet));
+        }
+        return mac.toString();
+    }
+
+    /**
+     * Note that we really cannot normalize the regex here, so the regex must work against the normalized and unnormalized forms.
+     */
+    @Override
+    public String normalizeRegex(String fieldRegex) {
+        return fieldRegex;
+    }
+
+    /**
+     * Tests whether a number fits in the 48 bits of a MAC address, i.e. its upper 16 bits are all zero.
+     *
+     * @param lData
+     *            the number to test; must not be null
+     * @return true if the upper 16 bits are zero
+     */
+    public static boolean isMac(Long lData) {
+        long mask = 0xFFFF000000000000L;
+        return (lData & mask) == 0;
+    }
+
+    /**
+     * Tests whether the text is a hexadecimal number that fits in 48 bits.
+     *
+     * @param value
+     *            the text to test
+     * @return true if the text parses as hex and passes {@link #isMac(Long)}; false otherwise, including when it cannot be parsed
+     */
+    public static boolean isMac(String value) {
+        long lData;
+        try {
+            lData = Long.parseLong(value, 16);
+        } catch (Exception e) {
+            return false;
+        }
+        return isMac(lData);
+    }
+
+    /**
+     * Returns a copy of the groups in which every one-character group is prefixed with {@code 0}; other groups are copied as they are.
+     */
+    private static String[] padWithZeros(String[] mac) {
+        String[] padded = new String[mac.length];
+        for (int i = 0; i < mac.length; i++) {
+            padded[i] = (mac[i].length() == 1) ? "0" + mac[i] : mac[i];
+        }
+        return padded;
+    }
+
+    @Override
+    public String normalizeDelegateType(String delegateIn) {
+        return normalize(delegateIn);
+    }
+
+    /**
+     * Returns the input unchanged.
+     */
+    @Override
+    public String denormalize(String in) {
+        return in;
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/NetworkNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NetworkNormalizer.java
new file mode 100644
index 00000000000..444f9015d66
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NetworkNormalizer.java
@@ -0,0 +1,46 @@
+package datawave.data.normalizer;
+
+/**
+ * Normalizes a network value by trying, in order, the IP address normalizer, the MAC address normalizer and finally the lower-case / no-diacritics string
+ * normalizer. The first one that does not throw wins.
+ */
+public class NetworkNormalizer extends AbstractNormalizer {
+
+    private static final long serialVersionUID = 8279399353763569005L;
+
+    /**
+     * Normalizes the value as an IP address if possible, else as a MAC address, else as a plain string.
+     *
+     * @param fieldValue
+     *            the value to normalize
+     * @return the normalized value
+     */
+    @Override
+    public String normalize(String fieldValue) {
+        try {
+            return IP_ADDRESS_NORMALIZER.normalize(fieldValue);
+        } catch (Exception notAnIp) {
+            // fall through and try as a mac address
+        }
+
+        try {
+            return MAC_ADDRESS_NORMALIZER.normalize(fieldValue);
+        } catch (Exception notAMac) {
+            // fall through and default to string normalization
+        }
+
+        return LC_NO_DIACRITICS_NORMALIZER.normalize(fieldValue);
+    }
+
+    /**
+     * Note that we really cannot normalize the regex here, so the regex must work against the normalized and unnormalized forms.
+     */
+    @Override
+    public String normalizeRegex(String fieldRegex) {
+        return fieldRegex;
+    }
+
+    @Override
+    public String normalizeDelegateType(String delegateIn) {
+        return normalize(delegateIn);
+    }
+
+    /**
+     * Returns the input unchanged.
+     */
+    @Override
+    public String denormalize(String in) {
+        return in;
+    }
+
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/NoOpNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NoOpNormalizer.java
new file mode 100644
index 00000000000..5b8277cb3b6
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NoOpNormalizer.java
@@ -0,0 +1,27 @@
+package datawave.data.normalizer;
+
+/**
+ * A Normalizer that performs no normalization: values and regexes are returned exactly as given.
+ */
+public class NoOpNormalizer extends AbstractNormalizer {
+
+    private static final long serialVersionUID = -2599171413081079348L;
+
+    /**
+     * Returns the value unchanged.
+     */
+    @Override
+    public String normalize(String fieldValue) {
+        return fieldValue;
+    }
+
+    /**
+     * Returns the regex unchanged.
+     */
+    @Override
+    public String normalizeRegex(String fieldRegex) {
+        return fieldRegex;
+    }
+
+    /**
+     * Delegates to {@link #normalize(String)}.
+     */
+    @Override
+    public String normalizeDelegateType(String delegateIn) {
+        return normalize(delegateIn);
+    }
+
+    /**
+     * Returns the input unchanged.
+     */
+    @Override
+    public String denormalize(String in) {
+        return in;
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/NormalizationException.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NormalizationException.java
new file mode 100644
index 00000000000..f4727e15207
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NormalizationException.java
@@ -0,0 +1,25 @@
+package datawave.data.normalizer;
+
+import java.io.Serializable;
+
+/**
+ * Checked exception signalling that a value could not be normalized.
+ */
+public class NormalizationException extends Exception implements Serializable {
+
+    private static final long serialVersionUID = -2700045630205135530L;
+
+    /** Creates the exception with neither message nor cause. */
+    public NormalizationException() {
+        super();
+    }
+
+    /** Creates the exception with a message only. */
+    public NormalizationException(String message) {
+        super(message);
+    }
+
+    /** Creates the exception with a cause only. */
+    public NormalizationException(Throwable cause) {
+        super(cause);
+    }
+
+    /** Creates the exception with both a message and a cause. */
+    public NormalizationException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/Normalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/Normalizer.java
new file mode 100644
index 00000000000..6ef0aaf407a
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/Normalizer.java
@@ -0,0 +1,42 @@
+package datawave.data.normalizer;
+
+import java.io.Serializable;
+import java.math.BigDecimal;
+import java.util.Collection;
+import java.util.Date;
+
+import datawave.data.type.util.Geometry;
+import datawave.data.type.util.IpAddress;
+import datawave.data.type.util.Point;
+
+public interface Normalizer extends Serializable { // contract implemented by the field-value normalizers in this package
+
+ Normalizer IP_ADDRESS_NORMALIZER = new IpAddressNormalizer(); // from here through NOOP_NORMALIZER: one shared instance per concrete normalizer
+ Normalizer MAC_ADDRESS_NORMALIZER = new MacAddressNormalizer();
+ Normalizer LC_NO_DIACRITICS_NORMALIZER = new LcNoDiacriticsNormalizer();
+ Normalizer DATE_NORMALIZER = new DateNormalizer();
+ Normalizer RAW_DATE_NORMALIZER = new RawDateNormalizer();
+ Normalizer GEOMETRY_NORMALIZER = new GeometryNormalizer();
+ Normalizer GEO_LAT_NORMALIZER = new GeoLatNormalizer();
+ Normalizer GEO_LON_NORMALIZER = new GeoLonNormalizer();
+ Normalizer GEO_NORMALIZER = new GeoNormalizer();
+ Normalizer HEX_STRING_NORMALIZER = new HexStringNormalizer();
+ Normalizer LC_NORMALIZER = new LcNormalizer();
+ Normalizer NETWORK_NORMALIZER = new NetworkNormalizer();
+ Normalizer NUMBER_NORMALIZER = new NumberNormalizer();
+ Normalizer POINT_NORMALIZER = new PointNormalizer();
+ Normalizer TRIM_LEADING_ZEROS_NORMALIZER = new TrimLeadingZerosNormalizer();
+ Normalizer NOOP_NORMALIZER = new NoOpNormalizer();
+
+ String normalize(String in); // converts a raw field value to its normalized string form
+
+ String normalizeDelegateType(T delegateIn); // like normalize, but starting from an already-typed value of the delegate type
+
+ T denormalize(String in); // converts a string back to the delegate type
+
+ String normalizeRegex(String in); // returns the regex to use in place of the given one; may be the input itself
+
+ boolean normalizedRegexIsLossy(String in); // NOTE(review): presumably whether normalizing this regex can change what it matches - confirm the exact contract
+
+ Collection expand(String in); // NOTE(review): no implementation is visible in this file; presumably returns every normalized form of the value - confirm
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/NumberNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NumberNormalizer.java
new file mode 100644
index 00000000000..d1bb4eae6c2
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/NumberNormalizer.java
@@ -0,0 +1,66 @@
+package datawave.data.normalizer;
+
+import java.math.BigDecimal;
+
+import org.apache.log4j.Logger;
+
+import datawave.data.normalizer.regex.NumericRegexEncoder;
+import datawave.data.type.util.NumericalEncoder;
+
+public class NumberNormalizer extends AbstractNormalizer<BigDecimal> {
+
+    private static final long serialVersionUID = -2781476072987375820L;
+    private static final Logger log = Logger.getLogger(NumberNormalizer.class); // static: keeps the non-serializable logger out of the serialized form
+
+    public String normalize(String fv) { // returns fv as-is when it already decodes, otherwise its encoded form; IllegalArgumentException if encoding fails
+        if (NumericalEncoder.isPossiblyEncoded(fv)) {
+            try {
+                NumericalEncoder.decode(fv);
+                return fv;
+            } catch (Exception e2) {
+                // no problem here, we will simply try to encode it below
+            }
+        }
+        try {
+            return NumericalEncoder.encode(fv);
+        } catch (Exception e) {
+            throw new IllegalArgumentException("Failed to normalize value as a number: " + fv, e);
+        }
+    }
+
+    /**
+     * We can support regex against numbers. If the pattern cannot be encoded, the failure is logged at debug level and the pattern is returned unchanged.
+     */
+    public String normalizeRegex(String fieldRegex) {
+        try {
+            return NumericRegexEncoder.encode(fieldRegex);
+        } catch (IllegalArgumentException e) {
+            log.debug("Failed to normalize numeric field pattern '" + fieldRegex + "', returning regex as is", e);
+            return fieldRegex;
+        }
+    }
+
+    public boolean normalizedRegexIsLossy(String untrimmedRegex) { // lossy when the encoder reports a LEADING or TRAILING zero status
+        ZeroRegexStatus status = NumericRegexEncoder.getZeroRegexStatus(untrimmedRegex);
+
+        return (status.equals(ZeroRegexStatus.LEADING) || status.equals(ZeroRegexStatus.TRAILING));
+    }
+
+    @Override
+    public String normalizeDelegateType(BigDecimal delegateIn) {
+        return normalize(delegateIn.toString());
+    }
+
+    @Override
+    public BigDecimal denormalize(String in) { // decodes an encoded value; anything else is parsed directly as a BigDecimal
+        if (NumericalEncoder.isPossiblyEncoded(in)) {
+            try {
+                return NumericalEncoder.decode(in);
+            } catch (NumberFormatException e) {
+                // not encoded...
+            }
+        }
+        return new BigDecimal(in);
+    }
+
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/OneToManyNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/OneToManyNormalizer.java
new file mode 100644
index 00000000000..1ad641ce852
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/OneToManyNormalizer.java
@@ -0,0 +1,10 @@
+package datawave.data.normalizer;
+
+import java.util.List;
+
+public interface OneToManyNormalizer<T> extends Normalizer<T> { // a Normalizer whose additional methods return a List of results for one input
+
+    List normalizeToMany(String in); // NOTE(review): list element type is not visible in this listing
+
+    List normalizeDelegateTypeToMany(T foo);
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/PointNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/PointNormalizer.java
new file mode 100644
index 00000000000..51075af9d70
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/PointNormalizer.java
@@ -0,0 +1,72 @@
+package datawave.data.normalizer;
+
+import org.locationtech.geowave.core.geotime.index.dimension.LatitudeDefinition;
+import org.locationtech.geowave.core.geotime.index.dimension.LongitudeDefinition;
+import org.locationtech.geowave.core.index.NumericIndexStrategy;
+import org.locationtech.geowave.core.index.dimension.NumericDimensionDefinition;
+import org.locationtech.geowave.core.index.sfc.SFCFactory;
+import org.locationtech.geowave.core.index.sfc.tiered.TieredSFCIndexFactory;
+import org.locationtech.geowave.core.store.api.Index;
+import org.locationtech.geowave.core.store.index.CustomNameIndex;
+
+import datawave.data.type.util.Point;
+
+/**
+ * A normalizer that, given a parseable geometry string representing a point geometry will perform GeoWave indexing with a single-tier spatial geowave index
+ * configuration. The index strategy and the index are held in thread-locals, so each thread lazily builds its own instances.
+ */
+public class PointNormalizer extends AbstractGeometryNormalizer { // NOTE(review): the superclass type arguments are not recoverable from this listing
+    private static final long serialVersionUID = 171360806347433135L;
+
+    // NOTE: If we change the index strategy, then we will need to update the validHash method appropriately.
+    // @formatter:off
+    public static final ThreadLocal<NumericIndexStrategy> indexStrategy = ThreadLocal.withInitial(PointNormalizer::createIndexStrategy);
+    // @formatter:on
+
+    protected static NumericIndexStrategy createIndexStrategy() { // single-tier Hilbert curve over longitude and latitude
+        // @formatter:off
+        return TieredSFCIndexFactory.createSingleTierStrategy(
+                        new NumericDimensionDefinition[]{
+                                new LongitudeDefinition(),
+                                new LatitudeDefinition(
+                                                true)
+                                // just use the same range for latitude to make square sfc values in
+                                // decimal degrees (EPSG:4326)
+                        },
+                        new int[]{
+                                LONGITUDE_BITS,
+                                LATITUDE_BITS
+                        },
+                        SFCFactory.SFCType.HILBERT);
+        // @formatter:on
+    }
+
+    public static final ThreadLocal<Index> index = ThreadLocal.withInitial(() -> new CustomNameIndex(indexStrategy.get(), null, "pointIndex"));
+
+    public NumericIndexStrategy getIndexStrategy() {
+        // NOTE: If we change the index strategy, then we will need to update the validHash method appropriately.
+        return PointNormalizer.indexStrategy.get();
+    }
+
+    public static NumericIndexStrategy getPointIndexStrategy() { // static access to the calling thread's strategy
+        return PointNormalizer.indexStrategy.get();
+    }
+
+    public Index getIndex() {
+        return index.get();
+    }
+
+    public static Index getPointIndex() { // static access to the calling thread's index
+        return index.get();
+    }
+
+    protected Point createDatawaveGeometry(org.locationtech.jts.geom.Point geometry) { // wraps the JTS point in the datawave Point type
+        return new Point(geometry);
+    }
+
+    @Override
+    public boolean validTier(short tier) { // only tier 0x1f (31) is accepted
+        return tier == 0x1f;
+    }
+
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/RawDateNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/RawDateNormalizer.java
new file mode 100644
index 00000000000..d318ec8f4be
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/RawDateNormalizer.java
@@ -0,0 +1,34 @@
+package datawave.data.normalizer;
+
+import java.util.Collection;
+
+public class RawDateNormalizer extends AbstractNormalizer<String> { // String-typed normalizer that forwards its work to a DateNormalizer
+
+    private static final long serialVersionUID = -3268331784114135470L;
+    private final DateNormalizer delegate = new DateNormalizer(); // never reassigned, so declared final
+
+    @Override
+    public String normalize(String fieldValue) {
+        return delegate.normalize(fieldValue);
+    }
+
+    public String normalizeRegex(String fieldRegex) {
+        return delegate.normalizeRegex(fieldRegex);
+    }
+
+    @Override
+    public String normalizeDelegateType(String delegateIn) { // the delegate type is String, so this is the same call as normalize
+        return delegate.normalize(delegateIn);
+    }
+
+    @Override
+    public String denormalize(String in) { // the input is returned unchanged
+        return in;
+    }
+
+    @Override
+    public Collection expand(String dateString) {
+        return delegate.expand(dateString);
+    }
+
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/TrimLeadingZerosNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/TrimLeadingZerosNormalizer.java
new file mode 100644
index 00000000000..2adb0ff6002
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/TrimLeadingZerosNormalizer.java
@@ -0,0 +1,34 @@
+package datawave.data.normalizer;
+
+public class TrimLeadingZerosNormalizer extends AbstractNormalizer<String> { // normalizes by removing every leading '0' character
+
+    private static final long serialVersionUID = -5681890794025882300L;
+
+    public String normalize(String fv) {
+        final int length = fv.length();
+        int firstNonZero = 0;
+        while (firstNonZero < length && fv.charAt(firstNonZero) == '0') {
+            firstNonZero++;
+        }
+        // A value made up only of zeros (e.g. "000") is reduced to the empty string.
+        String trimmed = (firstNonZero == 0) ? fv : fv.substring(firstNonZero);
+        return trimmed;
+    }
+
+    /**
+     * Note that we really cannot normalize the regex here, so the regex must work against the normalized and unnormalized forms.
+     */
+    public String normalizeRegex(String fieldRegex) {
+        return fieldRegex;
+    }
+
+    @Override
+    public String normalizeDelegateType(String delegateIn) { // the delegate type is String, so this is the same as normalize
+        return normalize(delegateIn);
+    }
+
+    @Override
+    public String denormalize(String in) { // the input is returned unchanged; trimmed zeros are not restored
+        return in;
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/ZeroRegexStatus.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/ZeroRegexStatus.java
new file mode 100644
index 00000000000..2e6f43a155d
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/ZeroRegexStatus.java
@@ -0,0 +1,5 @@
+package datawave.data.normalizer;
+
+public enum ZeroRegexStatus { // status type returned by NumericRegexEncoder.getZeroRegexStatus, as used by NumberNormalizer
+ LEADING, TRAILING, NONE // NumberNormalizer.normalizedRegexIsLossy reports true for LEADING and TRAILING, false for NONE
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/AlternationNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/AlternationNode.java
new file mode 100644
index 00000000000..8be4b30fa39
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/AlternationNode.java
@@ -0,0 +1,38 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Collection;
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents a regex alternation, i.e. {@code |}. Reports {@code NodeType.ALTERNATION} and dispatches to {@code Visitor.visitAlternation}.
+ */
+public class AlternationNode extends Node {
+
+    public AlternationNode() {}
+
+    public AlternationNode(Collection<? extends Node> children) {
+        super(children);
+    }
+
+    public AlternationNode(Map<String,String> properties) {
+        super(properties);
+    }
+
+    @Override
+    public NodeType getType() {
+        return NodeType.ALTERNATION;
+    }
+
+    @Override
+    public Object accept(Visitor visitor, Object data) {
+        return visitor.visitAlternation(this, data);
+    }
+
+    @Override
+    public AlternationNode shallowCopy() { // new node built from this node's properties only; no parent or children
+        return new AlternationNode(this.properties);
+    }
+
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/AnyCharNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/AnyCharNode.java
new file mode 100644
index 00000000000..5c30c3f90e0
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/AnyCharNode.java
@@ -0,0 +1,32 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents a dot in a regex pattern. Reports {@code NodeType.ANY_CHAR} and dispatches to {@code Visitor.visitAnyChar}.
+ */
+public class AnyCharNode extends Node {
+
+    public AnyCharNode() {}
+
+    public AnyCharNode(Map<String,String> properties) {
+        super(properties);
+    }
+
+    @Override
+    public NodeType getType() {
+        return NodeType.ANY_CHAR;
+    }
+
+    @Override
+    public Object accept(Visitor visitor, Object data) {
+        return visitor.visitAnyChar(this, data);
+    }
+
+    @Override
+    public AnyCharNode shallowCopy() { // new node built from this node's properties only; no parent or children
+        return new AnyCharNode(this.properties);
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/CharClassNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/CharClassNode.java
new file mode 100644
index 00000000000..0c6350c4b22
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/CharClassNode.java
@@ -0,0 +1,48 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Map;
+import java.util.Objects;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents a character class in a regex pattern encapsulated by {@code [...]}. Negation is stored as the string property {@link #PROPERTY_NEGATED}.
+ */
+public class CharClassNode extends Node {
+
+    public static final String PROPERTY_NEGATED = "negated";
+    private static final String TRUE = String.valueOf(true);
+
+    public CharClassNode() {}
+
+    public CharClassNode(boolean negated) {
+        setProperty(PROPERTY_NEGATED, String.valueOf(negated));
+    }
+
+    public CharClassNode(Map<String,String> properties) {
+        super(properties);
+    }
+
+    public boolean isNegated() { // false when the property is absent or holds anything other than "true"
+        return hasProperty(PROPERTY_NEGATED) && TRUE.equals(getProperty(PROPERTY_NEGATED));
+    }
+
+    public void negate() { // marks the class as negated; there is no inverse operation in this class
+        setProperty(PROPERTY_NEGATED, TRUE);
+    }
+
+    @Override
+    public NodeType getType() {
+        return NodeType.CHAR_CLASS;
+    }
+
+    @Override
+    public Object accept(Visitor visitor, Object data) {
+        return visitor.visitCharClass(this, data);
+    }
+
+    @Override
+    public CharClassNode shallowCopy() { // new node built from this node's properties only; no parent or children
+        return new CharClassNode(this.properties);
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/CharRangeNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/CharRangeNode.java
new file mode 100644
index 00000000000..4538c3d7794
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/CharRangeNode.java
@@ -0,0 +1,59 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Map;
+import java.util.Objects;
+import java.util.StringJoiner;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents a character range defined in a character class in a regex pattern. Both bounds are stored as one-character string properties.
+ */
+public class CharRangeNode extends Node {
+
+    public static final String PROPERTY_START = "start";
+    public static final String PROPERTY_END = "end";
+
+    public CharRangeNode() {}
+
+    public CharRangeNode(Map<String,String> properties) {
+        super(properties);
+    }
+
+    public CharRangeNode(char start, char end) {
+        setStart(start);
+        setEnd(end);
+    }
+
+    public char getStart() { // fails if the start property was never set
+        return getProperty(PROPERTY_START).charAt(0);
+    }
+
+    public void setStart(char start) {
+        setProperty(PROPERTY_START, String.valueOf(start));
+    }
+
+    public char getEnd() { // fails if the end property was never set
+        return getProperty(PROPERTY_END).charAt(0);
+    }
+
+    public void setEnd(char end) {
+        setProperty(PROPERTY_END, String.valueOf(end));
+    }
+
+    @Override
+    public NodeType getType() {
+        return NodeType.CHAR_RANGE;
+    }
+
+    @Override
+    public Object accept(Visitor visitor, Object data) {
+        return visitor.visitCharRange(this, data);
+    }
+
+    @Override
+    public CharRangeNode shallowCopy() { // new node built from this node's properties only; no parent or children
+        return new CharRangeNode(this.properties);
+    }
+
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/DigitCharClassNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/DigitCharClassNode.java
new file mode 100644
index 00000000000..d77e2ccab97
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/DigitCharClassNode.java
@@ -0,0 +1,34 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents the digit character class {@code \d} in a regex pattern. Reports {@code NodeType.DIGIT_CHAR_CLASS} and dispatches to {@code Visitor.visitDigitChar}.
+ */
+public class DigitCharClassNode extends Node {
+
+    protected DigitCharClassNode() { // NOTE(review): protected, unlike the public no-arg constructors of the sibling node classes - confirm this is intended
+        super();
+    }
+
+    public DigitCharClassNode(Map<String,String> properties) {
+        super(properties);
+    }
+
+    @Override
+    public NodeType getType() {
+        return NodeType.DIGIT_CHAR_CLASS;
+    }
+
+    @Override
+    public Object accept(Visitor visitor, Object data) {
+        return visitor.visitDigitChar(this, data);
+    }
+
+    @Override
+    public DigitCharClassNode shallowCopy() { // new node built from this node's properties only; no parent or children
+        return new DigitCharClassNode(this.properties);
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EmptyNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EmptyNode.java
new file mode 100644
index 00000000000..f0aa0f983aa
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EmptyNode.java
@@ -0,0 +1,32 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Placeholder empty node for empty groups or empty alternation branches. Reports {@code NodeType.EMPTY} and dispatches to {@code Visitor.visitEmpty}.
+ */
+public class EmptyNode extends Node {
+
+    public EmptyNode() {}
+
+    public EmptyNode(Map<String,String> properties) {
+        super(properties);
+    }
+
+    @Override
+    public NodeType getType() {
+        return NodeType.EMPTY;
+    }
+
+    @Override
+    public Object accept(Visitor visitor, Object data) {
+        return visitor.visitEmpty(this, data);
+    }
+
+    @Override
+    public Node shallowCopy() { // new EmptyNode built from this node's properties only; no parent or children
+        return new EmptyNode(this.properties);
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EncodedNumberNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EncodedNumberNode.java
new file mode 100644
index 00000000000..7fdfa375996
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EncodedNumberNode.java
@@ -0,0 +1,37 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Collection;
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents an encoded simple number in a regex tree. Reports {@code NodeType.ENCODED_NUMBER} and dispatches to {@code Visitor.visitEncodedNumber}.
+ */
+public class EncodedNumberNode extends Node {
+
+    public EncodedNumberNode() {}
+
+    public EncodedNumberNode(Collection<Node> children) { // each child is appended and re-parented to this node
+        addChildren(children);
+    }
+
+    public EncodedNumberNode(Map<String,String> properties) {
+        super(properties);
+    }
+
+    @Override
+    public NodeType getType() {
+        return NodeType.ENCODED_NUMBER;
+    }
+
+    @Override
+    public Object accept(Visitor visitor, Object data) {
+        return visitor.visitEncodedNumber(this, data);
+    }
+
+    @Override
+    public Node shallowCopy() { // new EncodedNumberNode built from this node's properties only; no parent or children
+        return new EncodedNumberNode(this.properties);
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EncodedPatternNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EncodedPatternNode.java
new file mode 100644
index 00000000000..ce40d4ec7d8
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EncodedPatternNode.java
@@ -0,0 +1,41 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Collection;
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents an encoded pattern in a regex tree. Reports {@code NodeType.ENCODED_PATTERN} and dispatches to {@code Visitor.visitEncodedPattern}.
+ */
+public class EncodedPatternNode extends Node {
+
+    public EncodedPatternNode() {}
+
+    public EncodedPatternNode(Node child) {
+        super(child);
+    }
+
+    public EncodedPatternNode(Collection<? extends Node> children) {
+        super(children);
+    }
+
+    public EncodedPatternNode(Map<String,String> properties) {
+        super(properties);
+    }
+
+    @Override
+    public NodeType getType() {
+        return NodeType.ENCODED_PATTERN;
+    }
+
+    @Override
+    public Object accept(Visitor visitor, Object data) {
+        return visitor.visitEncodedPattern(this, data);
+    }
+
+    @Override
+    public Node shallowCopy() { // new EncodedPatternNode built from this node's properties only; no parent or children
+        return new EncodedPatternNode(this.properties);
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EndAnchorNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EndAnchorNode.java
new file mode 100644
index 00000000000..e645d597ece
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EndAnchorNode.java
@@ -0,0 +1,32 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents a regex end anchor, i.e. {@code $}. Reports {@code NodeType.END_ANCHOR} and dispatches to {@code Visitor.visitEndAnchor}.
+ */
+public class EndAnchorNode extends Node {
+
+    public EndAnchorNode() {}
+
+    public EndAnchorNode(Map<String,String> properties) {
+        super(properties);
+    }
+
+    @Override
+    public NodeType getType() {
+        return NodeType.END_ANCHOR;
+    }
+
+    @Override
+    public Object accept(Visitor visitor, Object data) {
+        return visitor.visitEndAnchor(this, data);
+    }
+
+    @Override
+    public Node shallowCopy() { // new EndAnchorNode built from this node's properties only; no parent or children
+        return new EndAnchorNode(this.properties);
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EscapedSingleCharNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EscapedSingleCharNode.java
new file mode 100644
index 00000000000..1f4ae4d9654
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/EscapedSingleCharNode.java
@@ -0,0 +1,46 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents an escaped character in a regex pattern, e.g. {@code \-}. The character is stored as the one-character string property {@link #PROPERTY_CHAR}.
+ */
+public class EscapedSingleCharNode extends Node {
+
+    public static final String PROPERTY_CHAR = "char";
+
+    public EscapedSingleCharNode() {}
+
+    public EscapedSingleCharNode(char character) {
+        setCharacter(character);
+    }
+
+    public EscapedSingleCharNode(Map<String,String> properties) {
+        super(properties);
+    }
+
+    public char getCharacter() { // fails if the character property was never set
+        return getProperty(PROPERTY_CHAR).charAt(0);
+    }
+
+    public void setCharacter(char character) {
+        setProperty(PROPERTY_CHAR, String.valueOf(character));
+    }
+
+    @Override
+    public NodeType getType() {
+        return NodeType.ESCAPED_SINGLE_CHAR;
+    }
+
+    @Override
+    public Object accept(Visitor visitor, Object data) {
+        return visitor.visitEscapedSingleChar(this, data);
+    }
+
+    @Override
+    public EscapedSingleCharNode shallowCopy() { // new node built from this node's properties only; no parent or children
+        return new EscapedSingleCharNode(this.properties);
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/ExpressionNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/ExpressionNode.java
new file mode 100644
index 00000000000..ec00add8f7b
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/ExpressionNode.java
@@ -0,0 +1,41 @@
+package datawave.data.normalizer.regex;
+
+import java.util.List;
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents some subset or the full part of a regex pattern. Reports {@code NodeType.EXPRESSION} and dispatches to {@code Visitor.visitExpression}.
+ */
+public class ExpressionNode extends Node {
+
+    public ExpressionNode() {}
+
+    public ExpressionNode(Node child) {
+        super(child);
+    }
+
+    public ExpressionNode(List<? extends Node> children) {
+        super(children);
+    }
+
+    public ExpressionNode(Map<String,String> properties) {
+        super(properties);
+    }
+
+    @Override
+    public NodeType getType() {
+        return NodeType.EXPRESSION;
+    }
+
+    @Override
+    public Object accept(Visitor visitor, Object data) {
+        return visitor.visitExpression(this, data);
+    }
+
+    @Override
+    public ExpressionNode shallowCopy() { // new node built from this node's properties only; no parent or children
+        return new ExpressionNode(this.properties);
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/GroupNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/GroupNode.java
new file mode 100644
index 00000000000..52d32c9bb02
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/GroupNode.java
@@ -0,0 +1,41 @@
+package datawave.data.normalizer.regex;
+
+import java.util.List;
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents a regex group in a regex pattern encapsulated by {@code (...)}. Reports {@code NodeType.GROUP} and dispatches to {@code Visitor.visitGroup}.
+ */
+public class GroupNode extends Node {
+
+    public GroupNode() {}
+
+    public GroupNode(Node child) {
+        super(child);
+    }
+
+    public GroupNode(List<? extends Node> children) {
+        super(children);
+    }
+
+    public GroupNode(Map<String,String> properties) {
+        super(properties);
+    }
+
+    @Override
+    public NodeType getType() {
+        return NodeType.GROUP;
+    }
+
+    @Override
+    public Object accept(Visitor visitor, Object data) {
+        return visitor.visitGroup(this, data);
+    }
+
+    @Override
+    public GroupNode shallowCopy() { // new node built from this node's properties only; no parent or children
+        return new GroupNode(this.properties);
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/IntegerNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/IntegerNode.java
new file mode 100644
index 00000000000..2dbee0a54c6
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/IntegerNode.java
@@ -0,0 +1,48 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Map;
+import java.util.Objects;
+import java.util.StringJoiner;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents an integer parsed in a regex repetition that did not contain a range, e.g. {@code {3}}. The value is stored as the string property {@link #PROPERTY_VALUE}.
+ */
+public class IntegerNode extends Node {
+
+    public static final String PROPERTY_VALUE = "value";
+
+    public IntegerNode() {}
+
+    public IntegerNode(int value) {
+        setValue(value);
+    }
+
+    public IntegerNode(Map<String,String> properties) {
+        super(properties);
+    }
+
+    public int getValue() { // fails if the value property is missing or is not a parseable integer
+        return Integer.parseInt(getProperty(PROPERTY_VALUE));
+    }
+
+    public void setValue(int value) {
+        setProperty(PROPERTY_VALUE, String.valueOf(value));
+    }
+
+    @Override
+    public NodeType getType() {
+        return NodeType.INTEGER;
+    }
+
+    @Override
+    public Object accept(Visitor visitor, Object data) {
+        return visitor.visitInteger(this, data);
+    }
+
+    @Override
+    public IntegerNode shallowCopy() { // new node built from this node's properties only; no parent or children
+        return new IntegerNode(this.properties);
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/IntegerRangeNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/IntegerRangeNode.java
new file mode 100644
index 00000000000..fa264b0ab46
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/IntegerRangeNode.java
@@ -0,0 +1,67 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Map;
+import java.util.Objects;
+import java.util.StringJoiner;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents an integer range parsed from a regex repetition that specified a range, e.g. {@code {3,}} or {@code {3,10}}. An absent end property means the range has no upper bound.
+ */
+public class IntegerRangeNode extends Node {
+
+    public static final String PROPERTY_START = "start";
+    public static final String PROPERTY_END = "end";
+
+    public IntegerRangeNode() {}
+
+    public IntegerRangeNode(int start, Integer end) { // pass a null end for an unbounded range
+        setStart(start);
+        setEnd(end);
+    }
+
+    public IntegerRangeNode(Map<String,String> properties) {
+        super(properties);
+    }
+
+    public int getStart() { // fails if the start property is missing or is not a parseable integer
+        return Integer.parseInt(getProperty(PROPERTY_START));
+    }
+
+    public void setStart(int start) {
+        setProperty(PROPERTY_START, String.valueOf(start));
+    }
+
+    public Integer getEnd() { // null when no end property has been set
+        if (hasProperty(PROPERTY_END)) {
+            return Integer.valueOf(getProperty(PROPERTY_END));
+        }
+        return null;
+    }
+
+    public void setEnd(Integer end) { // NOTE(review): a null end is ignored, so an end that was already set stays in place - confirm this is intended
+        if (end != null) {
+            setProperty(PROPERTY_END, String.valueOf(end));
+        }
+    }
+
+    public boolean isEndBounded() {
+        return hasProperty(PROPERTY_END);
+    }
+
+    @Override
+    public NodeType getType() {
+        return NodeType.INTEGER_RANGE;
+    }
+
+    @Override
+    public Object accept(Visitor visitor, Object data) {
+        return visitor.visitIntegerRange(this, data);
+    }
+
+    @Override
+    public IntegerRangeNode shallowCopy() { // new node built from this node's properties only; no parent or children
+        return new IntegerRangeNode(this.properties);
+    }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/Node.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/Node.java
new file mode 100644
index 00000000000..30d50417981
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/Node.java
@@ -0,0 +1,333 @@
+package datawave.data.normalizer.regex;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+public abstract class Node { // base class of the regex tree: a parent link, optional string properties and an ordered list of children
+
+    protected Node parent;
+    protected Map<String,String> properties; // stays null until a property is set or a non-null map is supplied
+    protected ArrayList<Node> children = new ArrayList<>();
+
+    protected Node() {}
+
+    @SuppressWarnings("CopyConstructorMissesField")
+    protected Node(Node child) {
+        addChild(child);
+    }
+
+    protected Node(Map<String,String> properties) { // the map is copied, so later changes to the argument do not affect this node
+        if (properties != null) {
+            this.properties = new HashMap<>();
+            this.properties.putAll(properties);
+        }
+    }
+
+    protected Node(Collection<? extends Node> children) {
+        addChildren(children);
+    }
+
+    /**
+     * Return the node type.
+     *
+     * @return the type
+     */
+    public abstract NodeType getType();
+
+    /**
+     * Return the parent of this {@link Node}. Possibly null if a parent was never set.
+     *
+     * @return the parent
+     */
+    public Node getParent() {
+        return parent;
+    }
+
+    /**
+     * Set the parent for this node. This does not add this node to the parent's list of children.
+     *
+     * @param parent
+     *            the parent
+     */
+    public void setParent(Node parent) {
+        this.parent = parent;
+    }
+
+    public boolean hasProperties() { // true once the property map exists, even if it is empty
+        return properties != null;
+    }
+
+    public boolean hasProperty(String key) {
+        return hasProperties() && properties.containsKey(key);
+    }
+
+    public String getProperty(String key) { // null when the key is absent or when no properties have been set at all
+        return properties == null ? null : properties.get(key);
+    }
+
+    public void setProperty(String key, String value) { // creates the property map on first use
+        if (properties == null) {
+            properties = new HashMap<>();
+        }
+        properties.put(key, value);
+    }
+
+    public void setProperties(Map<String,String> properties) { // merges into the existing properties; a null argument is ignored
+        if (properties != null) {
+            if (this.properties == null) {
+                this.properties = new HashMap<>();
+            }
+            this.properties.putAll(properties);
+        }
+    }
+
+    /**
+     * Return the children of this {@link Node}. Possibly empty, but never null. The returned list is the live internal list, not a copy.
+     *
+     * @return the children
+     */
+    public List<Node> getChildren() {
+        return children;
+    }
+
+    /**
+     * Set the children for this {@link Node}. If the given list is null, the list of children for this node will be cleared.
+     *
+     * @param children
+     *            the children
+     */
+    public void setChildren(Collection<Node> children) {
+        // Snapshot first: the argument may be the live list returned by getChildren(), which clear() would also empty.
+        List<Node> replacement = (children == null) ? new ArrayList<Node>() : new ArrayList<Node>(children);
+        this.children.clear();
+        replacement.forEach(this::addChild);
+    }
+
+    /**
+     * Add a child to the end of the list of children for this node, and set this node as the child's parent.
+     *
+     * @param child
+     *            the child to add, must not be null
+     */
+    public void addChild(Node child) {
+        this.children.add(Objects.requireNonNull(child, "child")); // reject null before the list is modified
+        child.parent = this;
+    }
+
+    /**
+     * Add a child to this node at the specified index. Shifts the child at the specified index and any subsequent children to the right by one index.
+     *
+     * @param child
+     *            the child to insert, must not be null
+     * @param index
+     *            the index at which the child is to be inserted
+     */
+    public void addChild(Node child, int index) {
+        this.children.add(index, Objects.requireNonNull(child, "child")); // reject null before the list is modified
+        child.parent = this;
+    }
+
+    /**
+     * Add each node in the given list to the end of the list of children for this node.
+     *
+     * @param children
+     *            the children to add, must not be null
+     */
+    public void addChildren(Collection<? extends Node> children) {
+        children.forEach(this::addChild);
+    }
+
+    /**
+     * Return the child at the specified index in this node's list of children.
+     *
+     * @param index
+     *            the index
+     * @return the child
+     */
+    public Node getChildAt(int index) {
+        return children.get(index);
+    }
+
+    /**
+     * Return the number of children this node has.
+     *
+     * @return the total number of children
+     */
+    public int getChildCount() {
+        return children.size();
+    }
+
+    /**
+     * Return whether this node has any children.
+     *
+     * @return true if this node has at least one child, or false otherwise
+     */
+    public boolean hasChildren() {
+        return !children.isEmpty();
+    }
+
+    /**
+     * Returns whether this node is a leaf, that is, whether it has no children.
+     *
+     * @return true if this node has no children, or false otherwise
+     */
+    public boolean isLeaf() {
+        return children.isEmpty();
+    }
+
+    /**
+     * Accepts the given visitor and passes itself to the appropriate method in the {@link Visitor} with the given data.
+     *
+     * @param visitor
+     *            the visitor
+     * @param data
+     *            the data
+     * @return the result from the visitor
+     */
+    public abstract Object accept(Visitor visitor, Object data);
+
+    /**
+     * Passes the visitor to each child in this node for the child to accept. The values returned by the children are discarded.
+     *
+     * @param visitor
+     *            the visitor
+     * @param data
+     *            the data
+     * @return the data
+     */
+    public Object childrenAccept(Visitor visitor, Object data) {
+        children.forEach((child) -> child.accept(visitor, data));
+        return data;
+    }
+
+    /**
+     * Return a shallow copy of the node of the same type with all relevant attributes except for the parent and children.
+     *
+     * @return the shallow copy
+     */
+    public abstract Node shallowCopy();
+
+    /**
+     * Return whether any child of this node is an instance of a type not found in the given types. Children are matched on their exact runtime class.
+     *
+     * @param types
+     *            the types
+     * @return true if any child of this node is a type not found in the given types, or false otherwise
+     */
+    public boolean isAnyChildNotOf(Set<Class<? extends Node>> types) {
+        return children.stream().map(Node::getClass).anyMatch((t) -> !types.contains(t));
+    }
+
+    /**
+     * Return whether any child of this node is an instance of the given type.
+     *
+     * @param type
+     *            the type
+     * @return true if any child of this node is an instance of the given type, or false otherwise
+     */
+    public boolean isAnyChildOf(Class<? extends Node> type) {
+        return children.stream().anyMatch(type::isInstance);
+    }
+
+    /**
+     * Returns the index within this node of the first child of the specified type. If no child of the specified type exists in this node, -1 is returned.
+     *
+     * @param type
+     *            the type
+     * @return the index of the first child of the specified type, or -1 if no child of the type is found
+     */
+    public int indexOf(Class<? extends Node> type) {
+        return indexOf(type, 0);
+    }
+
+    /**
+     * Returns the index within this node of the first child of the specified type, starting the search at the specified index. If no child of the specified
+     * type exists at or after position {@code fromIndex}, -1 is returned.
+     *
+     * @param type
+     *            the type
+     * @param fromIndex
+     *            the index to start the search from
+     * @return the index of the first child of the specified type that is greater than or equal to {@code fromIndex}, or -1 if no child of the type is found
+     */
+    public int indexOf(Class<? extends Node> type, int fromIndex) {
+        for (int i = fromIndex; i < children.size(); i++) {
+            if (type.isInstance(children.get(i))) {
+                return i;
+            }
+        }
+        return -1;
+    }
+
+    /**
+     * Return the first child of this node, or null if this node has no children.
+     *
+     * @return the first node, possibly null
+     */
+    public Node getFirstChild() {
+        return children.isEmpty() ? null : children.get(0);
+    }
+
+    /**
+     * Return the last child of this node, or null if this node has no children.
+     *
+     * @return the last node, possibly null
+     */
+    public Node getLastChild() {
+        return children.isEmpty() ? null : children.get((children.size() - 1));
+    }
+
+    /**
+     * Removes the first child from this node. The removed child's parent reference is left unchanged.
+     *
+     * @throws IndexOutOfBoundsException
+     *             if there are no children
+     */
+    public void removeFirstChild() {
+        children.remove(0);
+    }
+
+    /**
+     * Return a new {@link NodeListIterator} instance that will traverse over this node's children.
+     *
+     * @return a new iterator
+     */
+    public NodeListIterator getChildrenIterator() {
+        return new NodeListIterator(this.children);
+    }
+
+    @Override
+    public boolean equals(Object o) { // compares runtime class and properties only; parent and children are not considered
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        Node node = (Node) o;
+        return Objects.equals(properties, node.properties);
+    }
+
+    @Override
+    public int hashCode() { // derived from properties only, consistent with equals
+        return Objects.hash(properties);
+    }
+
+    @Override
+    public String toString() { // simple class name, followed by the properties in parentheses when any exist
+        StringBuilder sb = new StringBuilder();
+        sb.append(getClass().getSimpleName());
+        if (properties != null) {
+            sb.append("(").append(properties).append(")");
+        }
+        return sb.toString();
+    }
+
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NodeListIterator.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NodeListIterator.java
new file mode 100644
index 00000000000..94980668530
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NodeListIterator.java
@@ -0,0 +1,192 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.NoSuchElementException;
+
+/**
+ * An iterator for traversing over a list of {@link Node} instances, with functionality for skipping over nodes that meet certain conditions.
+ */
+public class NodeListIterator {
+
+ /**
+ * The list of nodes being traversed. The list is held by reference, not copied.
+ */
+ private final List<Node> nodes;
+
+ /**
+ * The index of the node that the next call to {@link #next()} will return.
+ */
+ private int index;
+
+ public NodeListIterator(List<Node> nodes) {
+ this.nodes = nodes;
+ }
+
+ /**
+ * Return the current iterator index.
+ *
+ * @return the index
+ */
+ public int index() {
+ return index;
+ }
+
+ /**
+ * Set the current index for the iterator. The value is stored as given; no bounds check is performed here.
+ *
+ * @param index
+ * the index
+ */
+ public void setIndex(int index) {
+ this.index = index;
+ }
+
+ /**
+ * Return true if there are more nodes to return from the list.
+ *
+ * @return true if there is a next node to return
+ */
+ public boolean hasNext() {
+ return this.index < this.nodes.size();
+ }
+
+ /**
+ * Return the next node from the list and advance the iterator index by one.
+ *
+ * @return the next node
+ * @throws NoSuchElementException
+ * if there is no next node
+ */
+ public Node next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ return nodes.get(index++);
+ }
+
+ /**
+ * Return the next node from the list without modifying the current iterator index.
+ *
+ * @return the next node
+ * @throws NoSuchElementException
+ * if there is no next node
+ */
+ public Node peekNext() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ return nodes.get(index);
+ }
+
+ /**
+ * Return whether the next node is an instance of the given type. The iterator index is not modified.
+ *
+ * @param type
+ * the type
+ * @return true if the next node is an instance of the type, or false otherwise
+ * @throws NoSuchElementException
+ * if there is no next node
+ */
+ public boolean isNextInstanceOf(Class<? extends Node> type) {
+ return type.isInstance(peekNext());
+ }
+
+ /**
+ * Return whether the next node is an instance of one of the given types. The iterator index is not modified.
+ *
+ * @param types
+ * the types
+ * @return true if the next node is an instance of one of the given types, or false otherwise
+ * @throws NoSuchElementException
+ * if there is no next node
+ */
+ public boolean isNextInstanceOfAny(Collection<Class<? extends Node>> types) {
+ Node next = peekNext();
+ return types.stream().anyMatch(type -> type.isInstance(next));
+ }
+
+ /**
+ * Update the iterator so that the next call to {@link #next()} will return the first node that is not a regex element that can match against the character '0',
+ * starting from the iterator's current position in the list. If no such node is found, the iterator will be moved to the end of the list,
+ * {@link #hasNext()} will return false and any call to {@link #next()} will result in a {@link NoSuchElementException}.
+ */
+ public void seekPastZeroMatchingElements() {
+ while (hasNext()) {
+ // Peek at the next node.
+ Node next = peekNext();
+ // If it can match a zero, skip it; otherwise stop seeking.
+ if (RegexUtils.matchesZero(next)) {
+ // Explicitly call next so that we increment the iterator index.
+ next();
+ // Seek past a succeeding quantifier and question mark if present.
+ seekPastQuantifiers();
+ seekPastQuestionMarks();
+ } else {
+ return;
+ }
+ }
+ }
+
+ /**
+ * Update the iterator so that the next call to {@link #next()} will return the first node that is not a regex element that can match only the character '0',
+ * starting from the iterator's current position in the list. If no such node is found, the iterator will be moved to the end of the list,
+ * {@link #hasNext()} will return false and any call to {@link #next()} will result in a {@link NoSuchElementException}.
+ */
+ public void seekPastZeroOnlyElements() {
+ while (hasNext()) {
+ // Peek at the next node.
+ Node next = peekNext();
+ // If it can match only a zero, skip it; otherwise stop seeking.
+ if (RegexUtils.matchesZeroOnly(next)) {
+ // Explicitly call next so that we increment the iterator index.
+ next();
+ // Seek past a succeeding quantifier and question mark if present.
+ seekPastQuantifiers();
+ seekPastQuestionMarks();
+ } else {
+ return;
+ }
+ }
+ }
+
+ /**
+ * Update the iterator so that the next call to {@link #next()} will return the first node that is not a {@link ZeroOrMoreNode}, {@link OneOrMoreNode}, or
+ * {@link RepetitionNode}, starting from the iterator's current position in the list. If no such node is found, the iterator will be moved to the end of the
+ * list, {@link #hasNext()} will return false and any call to {@link #next()} will result in a {@link NoSuchElementException}.
+ */
+ public void seekPastQuantifiers() {
+ while (isNextQuantifier()) {
+ next();
+ }
+ }
+
+ /**
+ * Update the iterator so that the next call to {@link #next()} will return the first node that is not a {@link QuestionMarkNode}, starting from the
+ * iterator's current position in the list. If no such node is found, the iterator will be moved to the end of the list, {@link #hasNext()} will return
+ * false and any call to {@link #next()} will result in a {@link NoSuchElementException}.
+ */
+ public void seekPastQuestionMarks() {
+ while (isNextQuestionMark()) {
+ next();
+ }
+ }
+
+ /**
+ * Return whether the next node in the list is a {@link ZeroOrMoreNode}, {@link OneOrMoreNode}, or a {@link RepetitionNode}.
+ *
+ * @return true if the next node in the list is a quantifier type, or false if it is not or there is no next node
+ */
+ public boolean isNextQuantifier() {
+ return hasNext() && isNextInstanceOfAny(RegexConstants.QUANTIFIER_TYPES);
+ }
+
+ /**
+ * Return whether the next node in the list is a {@link QuestionMarkNode}.
+ *
+ * @return true if the next node in the list is a {@link QuestionMarkNode}, or false if it is not or there is no next node
+ */
+ public boolean isNextQuestionMark() {
+ return hasNext() && isNextInstanceOf(QuestionMarkNode.class);
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NodeType.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NodeType.java
new file mode 100644
index 00000000000..f72670789b2
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NodeType.java
@@ -0,0 +1,25 @@
+package datawave.data.normalizer.regex;
+
+public enum NodeType {
+ // Returned by Node#getType(); e.g. OneOrMoreNode reports ONE_OR_MORE and QuestionMarkNode reports OPTIONAL.
+ ALTERNATION,
+ ANY_CHAR,
+ CHAR_CLASS,
+ CHAR_RANGE,
+ DIGIT_CHAR_CLASS,
+ EMPTY,
+ END_ANCHOR,
+ ESCAPED_SINGLE_CHAR,
+ EXPRESSION,
+ GROUP,
+ INTEGER,
+ INTEGER_RANGE,
+ ONE_OR_MORE,
+ OPTIONAL,
+ REPETITION,
+ SINGLE_CHAR,
+ START_ANCHOR,
+ ZERO_OR_MORE,
+ ENCODED_NUMBER,
+ ENCODED_PATTERN
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NumericRegexEncoder.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NumericRegexEncoder.java
new file mode 100644
index 00000000000..4e0e885417c
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/NumericRegexEncoder.java
@@ -0,0 +1,462 @@
+package datawave.data.normalizer.regex;
+
+import java.util.function.Function;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.log4j.Logger;
+
+import com.google.common.base.CharMatcher;
+
+import datawave.data.normalizer.ZeroRegexStatus;
+import datawave.data.normalizer.regex.visitor.AlternationDeduper;
+import datawave.data.normalizer.regex.visitor.AnchorTrimmer;
+import datawave.data.normalizer.regex.visitor.DecimalPointPlacer;
+import datawave.data.normalizer.regex.visitor.DecimalPointValidator;
+import datawave.data.normalizer.regex.visitor.EmptyLeafTrimmer;
+import datawave.data.normalizer.regex.visitor.ExponentialBinAdder;
+import datawave.data.normalizer.regex.visitor.NegativeNumberPatternInverter;
+import datawave.data.normalizer.regex.visitor.NegativeVariantExpander;
+import datawave.data.normalizer.regex.visitor.NonEncodedNumbersChecker;
+import datawave.data.normalizer.regex.visitor.NumericCharClassValidator;
+import datawave.data.normalizer.regex.visitor.OptionalVariantExpander;
+import datawave.data.normalizer.regex.visitor.PrintVisitor;
+import datawave.data.normalizer.regex.visitor.SimpleNumberEncoder;
+import datawave.data.normalizer.regex.visitor.StringVisitor;
+import datawave.data.normalizer.regex.visitor.ZeroLengthRepetitionTrimmer;
+import datawave.data.normalizer.regex.visitor.ZeroTrimmer;
+import datawave.data.normalizer.regex.visitor.ZeroValueNormalizer;
+import datawave.data.type.util.NumericalEncoder;
+
+/**
+ * This class provides functionality for encoding numeric regexes that are meant to match against numbers that were previously encoded via
+ * {@link NumericalEncoder#encode(String)}. It is expected that incoming regexes are initially written to match against base ten numbers. Due to the complex
+ * nature of how numbers are encoded and trimmed, accuracy is NOT guaranteed when using this class to encode numeric regexes.
+ *
+ *
+ * Requirements
+ *
+ * The following requirements apply to all incoming regexes:
+ *
+ * - Patterns may not be blank.
+ * - Patterns may not contain whitespace.
+ * - Patterns must be compilable.
+ * - Patterns may not contain any letters other than {@code "\d"}.
+ * - Patterns may not contain any escaped characters other than {@code "\."}, {@code "\-"}, or {@code "\d"}.
+ * - Patterns may not contain any groups, e.g. {@code "(45.*)"}.
+ * - Patterns may not contain any decimal points that are followed by {@code ?} {@code *} {@code +} or a repetition quantifier such as {@code {3}}.
+ *
+ *
+ *
+ * Supported Regex Features
+ *
+ * The following regex features are supported, with any noted caveats.
+ *
+ * - Wildcards {@code "."}.
+ * - Digit character class {@code "\d"}.
+ * - Character class lists {@code "[]"}. CAVEAT: Digit characters only. Ranges are supported.
+ * - Zero or more quantifier {@code "*"}.
+ * - One or more quantifier {@code "+"}.
+ * - Repetition quantifier {@code "{x}"}, {@code "{x,}"}, and {@code "{x,y}"}.
+ * - Anchors {@code "^"} and {@code "$"}. CAVEAT: Technically not truly supported as they are ultimately removed during the pre-optimization process. However,
+ * using them will not result in an error.
+ * - Alternations {@code "|"}.
+ *
+ * Additionally, in order to mark a regex pattern as intended to match negative numbers only, a minus sign should be placed at the beginning of the regex
+ * pattern, e.g. {@code "-34.*"}, or at the beginning of each desired alternated pattern.
+ *
+ *
+ * Optimizations
+ *
+ * Before encoding the incoming regex, it will undergo the following modifications to optimize the ease of encoding:
+ *
+ * - Any empty alternations will be removed.
+ * - Any occurrences of the anchors {@code ^} or {@code $} will be removed. These will need to be added back into the returned encoded regex pattern
+ * afterwards if desired.
+ * - Optional variants (characters followed by {@code ?}) will be expanded into additional alternations as seen. This will not apply to any {@code ?}
+ * instances that directly follow a {@code *}, {@code +}, or {@code {x}}, as the {@code ?} in this case modifies the greediness of the matching rather than
+ * whether or not a character can be present.
+ * - Any characters immediately followed by the repetition quantifier {@code "{0}"} or {@code "{0,0}"} will be removed as they are expected to occur zero
+ * times. This does not apply to characters with the repetition quantifier {@code "{0,}"} or a variation of {@code "{0,x}"}.
+ * - Any patterns starting with {@code ".*"} or {@code ".+"} will result in the addition of an alternation of the same pattern with a minus sign in front of
+ * it to ensure a variant for matching negative numbers is added. This does not apply to any regex patterns already starting with {@code "-.*"} or
+ * {@code "-.+"}.
+ * - In some cases a pattern may match both exactly zero and another number greater than one, e.g. the pattern "[0-9].*". In this case, an alternation for the
+ * character {@code "0"} will be added (i.e. {@code "[0-9].*|0"}) to ensure that the ability to match zero is not lost when enriching the pattern with the
+ * required exponential bins to target the appropriate encoded numbers.
+ * - Pattern alternations will be de-duped.
+ *
+ *
+ *
+ * A strong effort has been made to make resulting encoded patterns as accurate as possible, but there is always a chance of at least some inaccuracy, given the
+ * nature of how numbers are encoded, particularly when it comes to numbers that are very similar other than the location of a decimal point, if present, in
+ * them. If you find that the resulting encoded regex is not matching the desired encoding numbers, try to simplify it into a higher number of alternations with
+ * simpler regexes if possible.
+ *
+ * @see NumericalEncoder
+ */
+public class NumericRegexEncoder {
+
+ private static final Logger log = Logger.getLogger(NumericRegexEncoder.class);
+
+ /**
+ * Matches any pattern containing a letter other than lowercase d. Whether a lowercase d is escaped (\d) is checked separately by containsUnescapedLowercaseD().
+ */
+ private static final Pattern RESTRICTED_LETTERS_PATTERN = Pattern.compile(".*[a-ce-zA-Z].*");
+
+ /**
+ * Matches any escaped character that is not \. \- or \d.
+ */
+ private static final Pattern RESTRICTED_ESCAPED_CHARS_PATTERN = Pattern.compile(".*\\\\[^.d\\-].*");
+
+ /**
+ * Matches any regex that consists only of anchors, hyphens (escaped or not), escaped periods, repetitions, the quantifier *, the quantifier +, optionals,
+ * alternations, and groups in any order with no alphanumeric characters that give any meaningful numeric information.
+ */
+ private static final Pattern NONSENSE_PATTERN = Pattern.compile("^\\^?(\\(*(\\\\\\.)*\\)*|(\\(*\\\\?[\\-*+?|])*\\)*|(\\{.*}))*\\$?$");
+
+ /**
+ * Matches any decimal points with ? + * or a repetition quantifier directly following them.
+ */
+ private static final Pattern INVALID_DECIMAL_POINTS_PATTERN = Pattern.compile(".*\\\\\\.[?+*{].*");
+
+ /**
+ * Matches against any variation of {@code .*}, {@code .+}, {@code .*?}, {@code .+?} that may or may not repeat, and that may or may not contain start
+ * and/or end anchors.
+ */
+ private static final Pattern NORMALIZATION_NOT_REQUIRED_PATTERN = Pattern.compile("^\\^?(\\.[*+]\\??)+\\$?$");
+
+ /**
+ * Encode the given numeric regex pattern such that it will match against encoded numbers.
+ *
+ * @param regex
+ * the regex pattern
+ * @return the encoded regex pattern
+ */
+ public static String encode(String regex) {
+ return new NumericRegexEncoder(regex).encode();
+ }
+
+ private final String pattern;
+ private Node patternTree;
+
+ private NumericRegexEncoder(String pattern) {
+ this.pattern = pattern;
+ }
+
+ public static ZeroRegexStatus getZeroRegexStatus(String regex) {
+ return ZeroTrimmer.getStatus(RegexParser.parse(regex).getChildren());
+ }
+
+ private String encode() {
+ if (log.isDebugEnabled()) {
+ log.debug("Encoding pattern " + pattern);
+ }
+
+ // Check the pattern for any quick failures.
+ checkPatternForQuickFailures();
+ // Encode the pattern only if it requires it.
+ if (isEncodingRequired()) {
+ parsePatternTree();
+ normalizePatternTree();
+ encodePatternTree();
+
+ String encoded = StringVisitor.toString(this.patternTree);
+ if (log.isDebugEnabled()) {
+ log.debug("Encoded pattern '" + pattern + "' to '" + encoded + "'");
+ }
+ return encoded;
+ } else {
+ if (log.isDebugEnabled()) {
+ log.debug("Encoding not required for pattern '" + pattern + "'");
+ }
+ return this.pattern;
+ }
+ }
+
+ /**
+ * Pre-validate the regex to quickly identify any indications that the regex is not valid for numerical expansion.
+ */
+ private void checkPatternForQuickFailures() {
+ checkForBlankPattern();
+ checkForWhitespace();
+ checkForCompilation();
+ checkForNonsense();
+ checkForRestrictedLetters();
+ checkForRestrictedEscapedCharacters();
+ checkForGroups();
+ checkForQuantifiedDecimalPoints();
+ }
+
+ /**
+ * Throws an exception if the regex pattern is blank.
+ */
+ private void checkForBlankPattern() {
+ if (this.pattern.isEmpty()) {
+ throw new IllegalArgumentException("Regex pattern may not be blank.");
+ }
+ }
+
+ /**
+ * Throws an exception if the regex contains any whitespace.
+ */
+ private void checkForWhitespace() {
+ if (CharMatcher.whitespace().matchesAnyOf(pattern)) {
+ throw new IllegalArgumentException("Regex pattern may not contain any whitespace.");
+ }
+ }
+
+ /**
+ * Throws an exception if the regex cannot be compiled.
+ */
+ private void checkForCompilation() {
+ try {
+ Pattern.compile(this.pattern);
+ } catch (PatternSyntaxException e) {
+ throw new IllegalArgumentException("Regex pattern will not compile.", e);
+ }
+ }
+
+ private void checkForNonsense() {
+ if (NONSENSE_PATTERN.matcher(this.pattern).matches()) {
+ throw new IllegalArgumentException("A nonsense pattern has been given that cannot be normalized.");
+ }
+ }
+
+ /**
+ * Throws an exception if the regex contains any letter other than an escaped lowercase d.
+ */
+ private void checkForRestrictedLetters() {
+ if (RESTRICTED_LETTERS_PATTERN.matcher(pattern).matches() || containsUnescapedLowercaseD()) {
+ throw new IllegalArgumentException(
+ "Regex pattern may not contain any letters other than \\d to indicate a member of the digit character class 0-9.");
+ }
+ }
+
+ /**
+ * Return whether the regex contains a lowercase d that is not directly preceded by a backslash.
+ */
+ private boolean containsUnescapedLowercaseD() {
+ int pos = pattern.indexOf(RegexConstants.LOWERCASE_D);
+ while (pos != -1) {
+ if (pos == 0 || pattern.charAt(pos - 1) != RegexConstants.BACKSLASH) {
+ return true;
+ }
+ pos = pattern.indexOf(RegexConstants.LOWERCASE_D, pos + 1);
+ }
+ return false;
+ }
+
+ /**
+ * Throws an exception if the regex contains any escaped characters other than {@code \.}, {@code \-} or {@code \d}.
+ */
+ private void checkForRestrictedEscapedCharacters() {
+ if (RESTRICTED_ESCAPED_CHARS_PATTERN.matcher(this.pattern).matches()) {
+ throw new IllegalArgumentException("Regex pattern may not contain any escaped characters other than \\. \\- or \\d.");
+ }
+ }
+
+ /**
+ * Throws an exception if the regex contains any occurrences of '(' indicating the start of a group.
+ */
+ private void checkForGroups() {
+ if (this.pattern.contains("(")) {
+ throw new IllegalArgumentException("Regex pattern may not contain any groups.");
+ }
+ }
+
+ /**
+ * Throws an exception if the regex contains any decimal points directly followed by ? * + or a repetition quantifier such as {@code {3}}.
+ */
+ private void checkForQuantifiedDecimalPoints() {
+ if (INVALID_DECIMAL_POINTS_PATTERN.matcher(this.pattern).matches()) {
+ throw new IllegalArgumentException("Regex pattern may not contain any decimal points that are directly followed by * ? or {}."); // NOTE(review): message omits '+', which the pattern also rejects
+ }
+ }
+
+ /**
+ * Returns whether the regex requires normalization.
+ *
+ * @return true if the regex requires normalization, or false otherwise.
+ */
+ private boolean isEncodingRequired() {
+ return !NORMALIZATION_NOT_REQUIRED_PATTERN.matcher(this.pattern).matches();
+ }
+
+ /**
+ * Parse the regex to a node tree.
+ */
+ private void parsePatternTree() {
+ parsePatternToTree();
+ validateCharClasses();
+ validateDecimalPoints();
+ }
+
+ /**
+ * Normalize the pattern tree.
+ */
+ private void normalizePatternTree() {
+ trimAnchors();
+ trimZeroLengthRepetitions();
+ trimEmptyLeafs();
+ expandOptionalVariants();
+ expandNegativeVariants();
+ expandZeroValues();
+ }
+
+ /**
+ * Encode the pattern tree.
+ */
+ private void encodePatternTree() {
+ dedupe();
+ encodeSimpleNumbers();
+ // If there are no more unencoded sub-patterns in the tree after encoding simple numbers, no further work needs to be done.
+ if (!moreToEncode()) {
+ return;
+ }
+ addExponentialBins();
+ trimZeros();
+ invertNegativePatterns();
+ addDecimalPoints();
+ dedupe();
+ }
+
+ /**
+ * Parse the pattern to a node tree.
+ */
+ private void parsePatternToTree() {
+ this.patternTree = RegexParser.parse(this.pattern);
+
+ if (log.isDebugEnabled()) {
+ log.debug("Parsed pattern to tree structure:\n" + PrintVisitor.printToString(this.patternTree));
+ }
+ }
+
+ /**
+ * Verify that the regex pattern does not contain any character classes with characters other than digits or a period.
+ */
+ private void validateCharClasses() {
+ NumericCharClassValidator.validate(this.patternTree);
+
+ if (log.isDebugEnabled()) {
+ log.debug("Validated character classes in regex");
+ }
+ }
+
+ /**
+ * Verify that the regex pattern does not contain any alternated expressions that have more than one required decimal point.
+ */
+ private void validateDecimalPoints() {
+ DecimalPointValidator.validate(this.patternTree);
+
+ if (log.isDebugEnabled()) {
+ log.debug("Validated decimal points classes in regex");
+ }
+ }
+
+ /**
+ * Trim all anchors.
+ */
+ private void trimAnchors() {
+ updatePatternTree(AnchorTrimmer::trim, "trimming anchors");
+ }
+
+ /**
+ * Trim all elements that occur exactly zero times.
+ */
+ private void trimZeroLengthRepetitions() {
+ updatePatternTree(ZeroLengthRepetitionTrimmer::trim, "trimming zero-length repetition characters");
+
+ // If the pattern is empty afterwards, throw an exception.
+ if (this.patternTree == null) {
+ throw new IllegalArgumentException("Regex pattern is empty after trimming all characters followed by {0} or {0,0}.");
+ }
+ }
+
+ /**
+ * Trim the tree of any empty nodes and empty alternations.
+ */
+ private void trimEmptyLeafs() {
+ updatePatternTree(EmptyLeafTrimmer::trim, "trimming empty leafs");
+ }
+
+ /**
+ * Expand optional variants.
+ */
+ private void expandOptionalVariants() {
+ updatePatternTree(OptionalVariantExpander::expand, "expanding optional variants");
+ }
+
+ /**
+ * Expand any patterns beginning with {@code .} to include a version with a minus sign in front of it.
+ */
+ private void expandNegativeVariants() {
+ updatePatternTree(NegativeVariantExpander::expand, "expanding negative variants");
+ }
+
+ /**
+ * If any patterns can match the number '0', add an alternation with '0'.
+ */
+ private void expandZeroValues() {
+ updatePatternTree(ZeroValueNormalizer::expand, "normalizing zero-value characters");
+ }
+
+ /**
+ * Remove any duplicate alternations.
+ */
+ private void dedupe() {
+ updatePatternTree(AlternationDeduper::dedupe, "de-duping");
+ }
+
+ /**
+ * Encode any and all simple numbers present in the pattern.
+ */
+ private void encodeSimpleNumbers() {
+ updatePatternTree(SimpleNumberEncoder::encode, "encoding simple numbers");
+ }
+
+ /**
+ * Return whether there are unencoded sub-patterns in the tree after encoding simple numbers.
+ *
+ * @return true if there are more patterns to encode, or false otherwise
+ */
+ private boolean moreToEncode() {
+ return NonEncodedNumbersChecker.check(this.patternTree);
+ }
+
+ /**
+ * Add exponential bin range information, e.g. \+[a-z], ![A-Z], etc.
+ */
+ private void addExponentialBins() {
+ updatePatternTree(ExponentialBinAdder::addBins, "adding exponential bin information");
+ }
+
+ /**
+ * Trim/consolidate any leading zeros in partially-encoded patterns.
+ */
+ private void trimZeros() {
+ updatePatternTree(ZeroTrimmer::trim, "trimming leading/trailing zeros");
+ }
+
+ /**
+ * Invert any patterns that are meant to match negative numbers.
+ */
+ private void invertNegativePatterns() {
+ updatePatternTree(NegativeNumberPatternInverter::invert, "inverting patterns for negative numbers");
+ }
+
+ /**
+ * Add decimal points where required.
+ */
+ private void addDecimalPoints() {
+ updatePatternTree(DecimalPointPlacer::addDecimalPoints, "adding decimal points");
+ }
+
+ private void updatePatternTree(Function<Node, Node> function, String operationDescription) {
+ this.patternTree = function.apply(this.patternTree); // replace the tree with the transformed result
+
+ if (log.isDebugEnabled()) {
+ log.debug("Regex after " + operationDescription + ": " + StringVisitor.toString(this.patternTree));
+ }
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/OneOrMoreNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/OneOrMoreNode.java
new file mode 100644
index 00000000000..1876642b856
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/OneOrMoreNode.java
@@ -0,0 +1,32 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents the one-or-more quantifier, i.e. the plus sign ({@code +}), in a regex pattern.
+ */
+public class OneOrMoreNode extends Node {
+
+ public OneOrMoreNode() {}
+
+ public OneOrMoreNode(Map properties) { // NOTE(review): Map has no type arguments here - confirm against the Node constructor
+ super(properties);
+ }
+
+ @Override
+ public NodeType getType() {
+ return NodeType.ONE_OR_MORE;
+ }
+
+ @Override
+ public Object accept(Visitor visitor, Object data) {
+ return visitor.visitOneToMany(this, data);
+ }
+
+ @Override
+ public OneOrMoreNode shallowCopy() {
+ return new OneOrMoreNode(this.properties); // the copy is constructed from this node's properties only
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/QuestionMarkNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/QuestionMarkNode.java
new file mode 100644
index 00000000000..0dd34dbefa7
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/QuestionMarkNode.java
@@ -0,0 +1,32 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents the question mark ({@code ?}) in a regex pattern. Reports its type as {@link NodeType#OPTIONAL}.
+ */
+public class QuestionMarkNode extends Node {
+
+ public QuestionMarkNode() {}
+
+ public QuestionMarkNode(Map properties) { // NOTE(review): Map has no type arguments here - confirm against the Node constructor
+ super(properties);
+ }
+
+ @Override
+ public NodeType getType() {
+ return NodeType.OPTIONAL;
+ }
+
+ @Override
+ public Object accept(Visitor visitor, Object data) {
+ return visitor.visitQuestionMark(this, data);
+ }
+
+ @Override
+ public QuestionMarkNode shallowCopy() {
+ return new QuestionMarkNode(this.properties); // the copy is constructed from this node's properties only
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexConstants.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexConstants.java
new file mode 100644
index 00000000000..df1ef9ee1c2
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexConstants.java
@@ -0,0 +1,65 @@
+package datawave.data.normalizer.regex;
+
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
+
+public class RegexConstants {
+
+ public static final char ZERO = '0';
+ public static final char ONE = '1';
+ public static final char TWO = '2';
+ public static final char THREE = '3';
+ public static final char FOUR = '4';
+ public static final char FIVE = '5';
+ public static final char SIX = '6';
+ public static final char SEVEN = '7';
+ public static final char EIGHT = '8';
+ public static final char NINE = '9';
+ public static final char LOWERCASE_D = 'd';
+ public static final char BACKSLASH = '\\';
+ public static final char PERIOD = '.';
+ public static final char HYPHEN = '-';
+ public static final char STAR = '*';
+ public static final char PLUS = '+';
+ public static final char PIPE = '|';
+ public static final char LEFT_PAREN = '(';
+ public static final char RIGHT_PAREN = ')';
+ public static final char LEFT_BRACKET = '[';
+ public static final char RIGHT_BRACKET = ']';
+ public static final char EXCLAMATION_POINT = '!';
+ public static final char LEFT_BRACE = '{';
+ public static final char RIGHT_BRACE = '}';
+ public static final char QUESTION_MARK = '?';
+ public static final char COMMA = ',';
+ public static final char CARET = '^';
+ public static final char DOLLAR_SIGN = '$';
+ public static final char CAPITAL_E = 'E';
+
+ public static final String ESCAPED_BACKSLASH = "\\\\";
+
+ /**
+ * Use base 10 when parsing characters to ints.
+ */
+ public static final int DECIMAL_RADIX = 10;
+
+ /**
+ * The list of all digits 0-9, in ascending order. This reflects all possible permutations for any \d found in the regex.
+ */
+ public static final List<Character> ALL_DIGITS = ImmutableList.of(ZERO, ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE);
+
+ public static final Set<Class<? extends Node>> QUANTIFIER_TYPES = ImmutableSet.of(ZeroOrMoreNode.class, OneOrMoreNode.class, RepetitionNode.class);
+
+ public static final Set<Class<? extends Node>> SIMPLE_NUMBER_TYPES = ImmutableSet.of(SingleCharNode.class, EscapedSingleCharNode.class,
+ StartAnchorNode.class, EndAnchorNode.class);
+
+ public static final Pattern SIMPLE_NUMBER_REGEX_PATTERN = Pattern.compile("^\\^?(\\\\?-)?\\d*(\\\\\\.)?\\d+\\$?$");
+
+ private RegexConstants() {
+ throw new UnsupportedOperationException(); // constants holder; never instantiated
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexParser.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexParser.java
new file mode 100644
index 00000000000..b1077b1d45b
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexParser.java
@@ -0,0 +1,305 @@
+package datawave.data.normalizer.regex;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * This parser will create a {@link Node} tree parsed from a regex pattern. This parser will be used for normalizing numeric regex patterns, and as such is not
+ * intended to be a fully comprehensive regex parser. Some native regex characters may be restricted.
+ */
+public class RegexParser {
+
+ /**
+ * Parses the given regex and returns a {@link ExpressionNode} tree representing the parsed regex. If the string is null, null will be returned.
+ *
+ * @param regex
+ * the regex to parse
+ * @return the {@link Node} tree
+ */
+ public static ExpressionNode parse(String regex) {
+ if (regex == null) {
+ return null;
+ }
+ Node node = parseAlternations(regex);
+ // Ensure the root node is always an expression node.
+ return node instanceof ExpressionNode ? (ExpressionNode) node : createExpressionWithChild(node);
+ }
+
+ /**
+ * Parses a regex expression from the given string that may contain alternations. Depending on the expression, one of the following will be returned:
+ *
+ * - An {@link EmptyNode} will be returned if a blank string is given.
+ * - If the expression contains top-level alternations, an {@link ExpressionNode} with an {@link AlternationNode} as its child with its child alternating
+ * expressions will be returned.
+ * - If the expression does not contain any top-level alternations, an {@link ExpressionNode} with the parsed expression as its children will be
+ * returned.
+ *
+ *
+ * @param string
+ * the string
+ * @return the parsed node
+ */
+ private static Node parseAlternations(String string) {
+ // If the string is blank, return an EmptyNode.
+ if (StringUtils.isBlank(string)) {
+ return new EmptyNode();
+ }
+
+ List expressions = RegexUtils.splitOnAlternations(string);
+ Node node;
+ if (expressions.size() > 1) {
+ // If we have more than one expression, we must make the parsed expressions children of an alternation node.
+ node = new AlternationNode();
+ for (String segment : expressions) {
+ Node child = parseAlternations(segment);
+ if (child != null) {
+ node.addChild(child);
+ }
+ }
+ } else if (expressions.size() == 1) {
+ node = parseExpression(expressions.get(0));
+ } else {
+ return null;
+ }
+ // If the parsed node is not an AlternationNode, GroupNode, or ExpressionNode, wrap it in an ExpressionNode.
+ return requiresWrap(node) ? createExpressionWithChild(node) : node;
+ }
+
+ /**
+ * Parses a subset of a regex expression that does not contain any top-level alternations, i.e. pipes.
+ *
+ * @param string
+ * the regex to parse
+ * @return the parsed node
+ */
+ private static Node parseExpression(String string) {
+
+ // If the string is blank, return an EmptyNode.
+ if (StringUtils.isBlank(string)) {
+ return new EmptyNode();
+ }
+
+ List nodes = new ArrayList<>();
+ RegexReader reader = new RegexReader(string);
+ while (reader.hasNext()) {
+ reader.captureNext();
+ RegexReader.ExpressionType type = reader.capturedType();
+ String content = reader.capturedExpression();
+ nodes.add(createNode(type, content));
+ }
+
+ // If we have a single child parsed from the expression, wrap it in an expression node if it is not already a wrapper node. Otherwise, return the child.
+ if (nodes.size() == 1) {
+ Node child = nodes.get(0);
+ return requiresWrap(child) ? createExpressionWithChild(child) : child;
+ } else {
+ // Wrap the children in an expression node.
+ ExpressionNode expressionNode = new ExpressionNode();
+ expressionNode.setChildren(nodes);
+ return expressionNode;
+ }
+ }
+
+ /**
+ * Return a new {@link ExpressionNode} with the given node as its child.
+ *
+ * @param child
+ * the child
+ * @return the new node
+ */
+ private static ExpressionNode createExpressionWithChild(Node child) {
+ ExpressionNode node = new ExpressionNode();
+ node.addChild(child);
+ return node;
+ }
+
+ /**
+ * Return whether the given node should be wrapped in an {@link ExpressionNode}. A node should not be wrapped if it is an instance of one of the following:
+ *
+ * - {@link ExpressionNode}
+ * - {@link GroupNode}
+ * - {@link AlternationNode}
+ *
+ *
+ * @param node
+ * the node
+ * @return true if the given node is a wrapper type, or false otherwise.
+ */
+ private static boolean requiresWrap(Node node) {
+ return node != null && !(node instanceof ExpressionNode || node instanceof AlternationNode || node instanceof GroupNode);
+ }
+
+ /**
+ * Return a new node of the specified type with the given content if applicable.
+ *
+ * @param type
+ * the node type to create
+ * @param content
+ * the content
+ * @return the new node
+ */
+ private static Node createNode(RegexReader.ExpressionType type, String content) {
+ switch (type) {
+ case ANCHOR_START:
+ return new StartAnchorNode();
+ case ANCHOR_END:
+ return new EndAnchorNode();
+ case ESCAPED_CHAR:
+ return createNodeFromEscapedChar(content);
+ case ANY_CHAR:
+ return new AnyCharNode();
+ case ZERO_OR_MORE:
+ return new ZeroOrMoreNode();
+ case ONE_OR_MORE:
+ return new OneOrMoreNode();
+ case QUESTION_MARK:
+ return new QuestionMarkNode();
+ case SINGLE_CHAR:
+ return new SingleCharNode(content.charAt(0));
+ case REPETITION:
+ return createRepetitionNode(content);
+ case CHAR_CLASS:
+ return createCharClassNode(content);
+ case GROUP:
+ return createGroupNode(content);
+ default:
+ throw new IllegalArgumentException("Unable to create new node of type " + type);
+ }
+ }
+
+ /**
+ * Return a new {@link Node} from the given escaped character. In the case of {@code \d}, a new {@link DigitCharClassNode} will be returned. Otherwise, a
+ * new {@link EscapedSingleCharNode} with the character will be returned.
+ *
+ * @param content
+ * the content
+ * @return the new node
+ */
+ private static Node createNodeFromEscapedChar(String content) {
+ char character = content.charAt(1);
+ if (character == RegexConstants.LOWERCASE_D) {
+ return new DigitCharClassNode();
+ }
+ return new EscapedSingleCharNode(character);
+ }
+
+ /**
+ * Return a new {@link RepetitionNode} parsed from the given expression. It is expected that the given content is an interval expression in the form
+ * {@code {x}}, {@code {x,y}}, {@code {x,}}, or {@code {,y}}.
+ *
+ * @param expression
+ * the interval expression
+ * @return the node
+ */
+ private static RepetitionNode createRepetitionNode(String expression) {
+ RepetitionNode node = new RepetitionNode();
+ int commaIndex = expression.indexOf(RegexConstants.COMMA);
+ if (commaIndex == -1) {
+ // If no comma is present, the interval expression is in the form {x}. Remove the curly braces and parse the number from x.
+ node.addChild(new IntegerNode(Integer.parseInt(trimFirstAndLastChar(expression))));
+ } else {
+ // If a comma is present, the interval expression is in the form {x,y} or {x,}. Remove the curly braces and parse the range from x and y.
+ int start = Integer.parseInt(expression.substring(1, commaIndex));
+ Integer end = commaIndex == (expression.length() - 2) ? null : Integer.parseInt(expression.substring((commaIndex + 1), (expression.length() - 1)));
+ node.addChild(new IntegerRangeNode(start, end));
+ }
+ return node;
+ }
+
+ /**
+ * Return a new {@link CharClassNode} parsed from the given expression. Parsing negated character classes is supported. The character class may only contain
+ * the following: digits, a period, a hyphen, a numerical range.
+ *
+ * @param expression
+ * the character class expression
+ * @return the node
+ */
+ private static CharClassNode createCharClassNode(String expression) {
+ CharClassNode node = new CharClassNode();
+ char[] chars = expression.toCharArray();
+ char next;
+ for (int pos = 1; pos < (chars.length - 1); pos++) {
+ char current = chars[pos];
+ switch (current) {
+ case RegexConstants.HYPHEN:
+ // We found a hyphen at the start or end of the character class, e.g. [-123] or [123-]. Hyphens do not need to be escaped in these cases.
+ node.addChild(new SingleCharNode(current));
+ break;
+ case RegexConstants.BACKSLASH:
+ // We found an escaped character.
+ next = chars[(pos) + 1];
+ node.addChild(new EscapedSingleCharNode(next));
+ pos++;
+ break;
+ case RegexConstants.CARET:
+ // If the caret is the first character in the class, we have a negated character class, e.g. [^123].
+ if (pos == 1) {
+ node.negate();
+ } else {
+ // Otherwise add it as a single character.
+ node.addChild(new SingleCharNode(current));
+ }
+ break;
+ default:
+ // Check if we have a non-trailing hyphen that indicates a defined character range.
+ next = chars[(pos + 1)];
+ if (next == RegexConstants.HYPHEN) {
+ char charAfterNext = chars[(pos) + 2];
+ // If the next character is not a closing bracket, we have a character range. Otherwise, the hyphen will need to be captured as its own
+ // single character in an earlier switch case,
+ if (charAfterNext != RegexConstants.RIGHT_BRACKET) {
+ node.addChild(new CharRangeNode(current, charAfterNext));
+ // Move to the next character after the range.
+ pos = pos + 2;
+ }
+ } else {
+ // Otherwise, add the current character as a single character.
+ node.addChild(new SingleCharNode(current));
+ }
+ break;
+ }
+ }
+ return node;
+ }
+
+ /**
+ * Return a new {@link GroupNode} parsed from the given expression.
+ *
+ * @param expression
+ * the group expression
+ * @return the node
+ */
+ private static GroupNode createGroupNode(String expression) {
+ String subExpression = trimFirstAndLastChar(expression);
+ GroupNode groupNode = new GroupNode();
+ Node node = parseAlternations(subExpression);
+ if (node != null) {
+ groupNode.addChild(node);
+ }
+ return groupNode;
+ }
+
+ /**
+ * Return the given string with the first and last character trimmed. If the string has a length less than 3, an empty string will be returned.
+ *
+ * @param str
+ * the string
+ * @return the trimmed string
+ */
+ private static String trimFirstAndLastChar(String str) {
+ if (str.length() < 3) {
+ return "";
+ } else {
+ return str.substring(1, (str.length() - 1));
+ }
+ }
+
+ /**
+ * Do not allow this class to be instantiated.
+ */
+ private RegexParser() {
+ throw new UnsupportedOperationException();
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexReader.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexReader.java
new file mode 100644
index 00000000000..b3b20df7af1
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexReader.java
@@ -0,0 +1,246 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Arrays;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * A reader that traverses over a regex pattern and both identifies and steps through individual regex elements.
+ */
+class RegexReader {
+
+ public enum ExpressionType {
+ GROUP, ALTERNATION, REPETITION, CHAR_CLASS, SINGLE_CHAR, ESCAPED_CHAR, ANY_CHAR, ZERO_OR_MORE, ONE_OR_MORE, QUESTION_MARK, ANCHOR_START, ANCHOR_END
+ }
+
+ /**
+ * The original char array of the pattern.
+ */
+ private final char[] pattern;
+
+ /**
+ * Index into the pattern array that keeps track of how much has been read.
+ */
+ private int cursor = 0;
+
+ /**
+ * The type of the most recently read regex expression.
+ */
+ private ExpressionType capturedType;
+
+ /**
+ * The content of the most recently read regex expression.
+ */
+ private String capturedContent;
+
+ /**
+ * Create a new {@link RegexReader} that will read over the given regex pattern.
+ *
+ * @param pattern
+ * the regex pattern to read over
+ */
+ public RegexReader(String pattern) {
+ Preconditions.checkNotNull(pattern, "regex must not be null");
+ this.pattern = pattern.toCharArray();
+ }
+
+ /**
+ * Return whether there is another expression to capture in this reader.
+ *
+ * @return true if there is another expression, or false otherwise
+ */
+ public boolean hasNext() {
+ return cursor < pattern.length;
+ }
+
+ /**
+ * Return the {@link ExpressionType} identified for the next expression during the last call to {@link #captureNext()}, or null if {@link #captureNext()}
+ * has never been called.
+ *
+ * @return the captured type
+ */
+ public ExpressionType capturedType() {
+ return capturedType;
+ }
+
+ /**
+ * Return the string content identified for the next expression during the last call to {@link #captureNext()}, or null if {@link #captureNext()} has never
+ * been called.
+ *
+ * @return the captured string expression
+ */
+ public String capturedExpression() {
+ return capturedContent;
+ }
+
+ /**
+ * Identify and capture the regex node type and content of the next expression in this reader.
+ *
+ * @throws IllegalStateException
+ * if {@link #hasNext()} returns false
+ */
+ public void captureNext() {
+ if (hasNext()) {
+ identifyCurrentType();
+ int startOfCapture = cursor;
+ skipPastCurrentExpression();
+ this.capturedContent = new String(Arrays.copyOfRange(pattern, startOfCapture, cursor));
+ } else {
+ throw new IllegalStateException("Reader does not have next to capture");
+ }
+ }
+
+ /**
+ * Identify the type of the current expression starting at the current cursor point.
+ */
+ private void identifyCurrentType() {
+ char current = current();
+ switch (current) {
+ case RegexConstants.PIPE:
+ this.capturedType = ExpressionType.ALTERNATION;
+ break;
+ case RegexConstants.LEFT_PAREN:
+ this.capturedType = ExpressionType.GROUP;
+ break;
+ case RegexConstants.LEFT_BRACE:
+ this.capturedType = ExpressionType.REPETITION;
+ break;
+ case RegexConstants.LEFT_BRACKET:
+ this.capturedType = ExpressionType.CHAR_CLASS;
+ break;
+ case RegexConstants.CARET:
+ this.capturedType = ExpressionType.ANCHOR_START;
+ break;
+ case RegexConstants.DOLLAR_SIGN:
+ this.capturedType = ExpressionType.ANCHOR_END;
+ break;
+ case RegexConstants.PERIOD:
+ this.capturedType = ExpressionType.ANY_CHAR;
+ break;
+ case RegexConstants.STAR:
+ this.capturedType = ExpressionType.ZERO_OR_MORE;
+ break;
+ case RegexConstants.PLUS:
+ this.capturedType = ExpressionType.ONE_OR_MORE;
+ break;
+ case RegexConstants.QUESTION_MARK:
+ this.capturedType = ExpressionType.QUESTION_MARK;
+ break;
+ case RegexConstants.BACKSLASH:
+ this.capturedType = ExpressionType.ESCAPED_CHAR;
+ break;
+ default:
+ this.capturedType = ExpressionType.SINGLE_CHAR;
+ }
+ }
+
+ /**
+ * Return the character in the chars array at the current cursor index.
+ *
+ * @return the current character
+ */
+ private char current() {
+ return pattern[cursor];
+ }
+
+ /**
+ * Increments the cursor by one and returns the next character in the char array.
+ *
+ * @return the next character
+ */
+ private char next() {
+ return pattern[++cursor];
+ }
+
+ /**
+ * Increment the cursor by one.
+ */
+ private void skip() {
+ cursor++;
+ }
+
+ /**
+ * Increment the cursor by the given number of skips.
+ *
+ * @param skips
+ * the skips to increment by
+ */
+ private void skip(int skips) {
+ cursor = cursor + skips;
+ }
+
+ /**
+ * Increment the cursor to point to the position after the current expression based on the current captured type.
+ */
+ private void skipPastCurrentExpression() {
+ switch (capturedType) {
+ case SINGLE_CHAR:
+ case ALTERNATION:
+ case ANY_CHAR:
+ case ZERO_OR_MORE:
+ case ONE_OR_MORE:
+ case QUESTION_MARK:
+ case ANCHOR_START:
+ case ANCHOR_END:
+ skip(1);
+ break;
+ case ESCAPED_CHAR:
+ skip(2);
+ break;
+ case CHAR_CLASS:
+ skipPastChar(RegexConstants.RIGHT_BRACKET);
+ break;
+ case REPETITION:
+ skipPastChar(RegexConstants.RIGHT_BRACE);
+ break;
+ case GROUP:
+ skipPastGroup();
+ break;
+ default:
+ throw new IllegalArgumentException("Unable to seek past type " + capturedType);
+ }
+ }
+
+ /**
+ * Increment the cursor to point to the position after the first occurrence of the given character.
+ *
+ * @param character
+ * the character to skip past
+ */
+ private void skipPastChar(char character) {
+ while (hasNext()) {
+ char next = next();
+ if (next == character) {
+ skip();
+ return;
+ }
+ }
+ }
+
+ /**
+ * Increment the cursor to point to the position after the current group expression. This method will handle nested groups.
+ */
+ private void skipPastGroup() {
+ int nestedGroups = 0;
+ while (hasNext()) {
+ char next = next();
+ switch (next) {
+ case RegexConstants.RIGHT_PAREN:
+ // If there are no nested groups, we've found the end of the target group. Skip ahead to the next character after it.
+ if (nestedGroups == 0) {
+ skip();
+ return;
+ } else {
+ // We've traversed to the end of a nested group.
+ nestedGroups--;
+ }
+ break;
+ case RegexConstants.LEFT_PAREN:
+ // If we encounter a ( before the first ) we see, we've found a nested group and must traverse to the end it.
+ nestedGroups++;
+ break;
+ default:
+ }
+ }
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexUtils.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexUtils.java
new file mode 100644
index 00000000000..2fdd487e3d5
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RegexUtils.java
@@ -0,0 +1,639 @@
+package datawave.data.normalizer.regex;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import datawave.data.normalizer.regex.visitor.StringVisitor;
+import datawave.data.type.util.NumericalEncoder;
+
+public class RegexUtils {
+
+ /**
+  * Split the given string by all top-level alternations into individual regex segments to be further evaluated. Any pipes encapsulated within groups, e.g.
+  * (1|2|3) will not count as alternations to split. Escaped pipes and pipes inside character classes are not treated specially. See the following input
+  * examples:
+  * <ul>
+  * <li>Input {@code ""} will return the list {@code {""}}</li>
+  * <li>Input {@code "234.*"} will return the list {@code {"234.*"}}</li>
+  * <li>Input {@code "234.*|45|653.*"} will return the list {@code {"234.*", "45", "653.*"}}</li>
+  * <li>Input {@code "234.*|45|(3[34].*|4[54]3)"} will return the list {@code {"234.*", "45", "(3[34].*|4[54]3)"}}</li>
+  * <li>Input {@code "|34"} will return the list {@code {"", "34"}}</li>
+  * <li>Input {@code "34|"} will return the list {@code {"34", ""}}</li>
+  * <li>Input {@code "||"} will return the list {@code {"", "", ""}}</li>
+  * <li>Input {@code "|12||4|34|"} will return the list {@code {"", "12", "", "4", "34", ""}}</li>
+  * </ul>
+  *
+  * @param str
+  *            the string to split, must not be null
+  * @return the split segments, always containing at least one element
+  */
+ public static List<String> splitOnAlternations(String str) {
+     List<String> segments = new ArrayList<>();
+     // The number of groups that are currently open. A pipe is top-level only when this is zero.
+     int groupsToTraverse = 0;
+     int startOfSegment = 0;
+     for (int pos = 0; pos < str.length(); pos++) {
+         switch (str.charAt(pos)) {
+             case RegexConstants.PIPE:
+                 if (groupsToTraverse == 0) {
+                     // Capture the (possibly empty) segment before the pipe and start the next segment after it.
+                     segments.add(str.substring(startOfSegment, pos));
+                     startOfSegment = pos + 1;
+                 }
+                 break;
+             case RegexConstants.LEFT_PAREN:
+                 // We found the start of a group. Increment the number of groups we need to traverse.
+                 groupsToTraverse++;
+                 break;
+             case RegexConstants.RIGHT_PAREN:
+                 // We found the end of a group. Decrement the number of groups we need to traverse.
+                 groupsToTraverse--;
+                 break;
+             default:
+         }
+     }
+     // Whatever remains after the last top-level pipe is the final segment. It is empty for an empty input or a trailing pipe.
+     segments.add(str.substring(startOfSegment));
+     return segments;
+ }
+
+ /**
+  * Return whether the regex consists of a single simple number without any special operations, e.g. '1', '1\\.0', '-1', '-1\\.0'. Only the characters 0-9,
+  * hyphens, backslashes, and periods directly preceded by a backslash are accepted. The position of hyphens and backslashes is not validated, and an empty
+  * string is reported as a number.
+  *
+  * @param str
+  *            the regex to check
+  * @return true if the string contains only characters allowed in a simple number regex, or false otherwise
+  */
+ public static boolean isNumber(String str) {
+     char[] chars = str.toCharArray();
+     for (int currentPos = 0; currentPos < chars.length; currentPos++) {
+         switch (chars[currentPos]) {
+             case RegexConstants.BACKSLASH:
+             case RegexConstants.HYPHEN:
+             case RegexConstants.ZERO:
+             case RegexConstants.ONE:
+             case RegexConstants.TWO:
+             case RegexConstants.THREE:
+             case RegexConstants.FOUR:
+             case RegexConstants.FIVE:
+             case RegexConstants.SIX:
+             case RegexConstants.SEVEN:
+             case RegexConstants.EIGHT:
+             case RegexConstants.NINE:
+                 break;
+             case RegexConstants.PERIOD:
+                 // A period at the beginning of the regex, or one that is not preceded by a backslash, is a dot wildcard and not an escaped decimal
+                 // point.
+                 if (currentPos == 0 || chars[(currentPos - 1)] != RegexConstants.BACKSLASH) {
+                     return false;
+                 }
+                 break;
+             default:
+                 // Any characters other than 0-9, -, or \. indicate a non-simple number regex.
+                 return false;
+         }
+     }
+     return true;
+ }
+
+ /**
+ * Returns the escaped, encoded form of a string containing a number from part of a regex. The string must be a number, and may be escaped. See the
+ * following input examples:
+ *
+ * - Input {@code "1.2"} will return {@code "\+aE1\.2"}
+ * - Input {@code "1\.2"} will return {@code "\+aE1\.2"}
+ * - Input {@code "12"} will return {@code "\+bE1\.2"}
+ * - Input {@code "-1\.2"} will return {@code "\!ZE1\.2"}
+ * - Input {@code "-12"} will return {@code "\!YE1\.2"}
+ *
+ *
+ * @param str
+ * the string to encode
+ * @return the escaped, encoded number
+ */
+ public static String encodeNumber(String str) {
+ return escapeEncodedNumber(NumericalEncoder.encode(removeBackslashes(str)));
+ }
+
+ /**
+ * Return the given string with all backslashes removed from it.
+ *
+ * @param str
+ * the string
+ * @return the string without any backslashes
+ */
+ public static String removeBackslashes(String str) {
+ return str.replaceAll(RegexConstants.ESCAPED_BACKSLASH, "");
+ }
+
+ /**
+ * Return an encoded whole number with the characters {@code . ! +} escaped by a backslash.
+ */
+ public static String escapeEncodedNumber(String str) {
+ StringBuilder sb = new StringBuilder();
+ for (char current : str.toCharArray()) {
+ if (current == RegexConstants.PERIOD || current == RegexConstants.PLUS) {
+ sb.append(RegexConstants.BACKSLASH);
+ }
+ sb.append(current);
+ }
+ return sb.toString();
+ }
+
+ /**
+  * Find the position of the first child of the given node that is an escaped period.
+  *
+  * @param node
+  *            the node whose children are searched
+  * @return the index of the first escaped period, or -1 if not found
+  */
+ public static int getDecimalPointIndex(Node node) {
+     // Step through the escaped-character children in order until one of them is a period.
+     for (int index = node.indexOf(EscapedSingleCharNode.class); index != -1; index = node.indexOf(EscapedSingleCharNode.class, (index + 1))) {
+         EscapedSingleCharNode candidate = (EscapedSingleCharNode) node.getChildAt(index);
+         if (candidate.getCharacter() == RegexConstants.PERIOD) {
+             return index;
+         }
+     }
+     return -1;
+ }
+
+ /**
+ * Returns whether the first child in the given node tree is a minus sign.
+ *
+ * @param node
+ * the node
+ * @return true if the first child is a minus sign, or false otherwise
+ */
+ public static boolean isNegativeRegex(Node node) {
+ return isChar(node.getFirstChild(), RegexConstants.HYPHEN);
+ }
+
+ /**
+  * Check whether the given node is an {@link EscapedSingleCharNode} for the period character.
+  *
+  * @param node
+  *            the node
+  * @return true if the given node is an escaped period, or false otherwise.
+  */
+ public static boolean isDecimalPoint(Node node) {
+     if (!(node instanceof EscapedSingleCharNode)) {
+         return false;
+     }
+     return ((EscapedSingleCharNode) node).getCharacter() == RegexConstants.PERIOD;
+ }
+
+ /**
+ * Return whether the given node is the given character, escaped or otherwise.
+ *
+ * @param node
+ * the node
+ * @param character
+ * the character
+ * @return true if the given node is the given character, or false otherwise
+ */
+ public static boolean isChar(Node node, char character) {
+ if (node instanceof SingleCharNode) {
+ return ((SingleCharNode) node).getCharacter() == character;
+ } else if (node instanceof EscapedSingleCharNode) {
+ return ((EscapedSingleCharNode) node).getCharacter() == character;
+ }
+ return false;
+ }
+
+ /**
+  * Return whether the given node is a character class that would match against the given character. Single characters in the class, escaped or otherwise,
+  * are compared directly. Character ranges are compared by their base-10 digit values.
+  *
+  * @param node
+  *            the node
+  * @param character
+  *            the character
+  * @return true if the given character class would match against the given character, or false otherwise
+  * @throws IllegalArgumentException
+  *             if the given node is not a {@link CharClassNode}
+  */
+ public static boolean charClassMatches(Node node, char character) {
+     if (!(node instanceof CharClassNode)) {
+         throw new IllegalArgumentException("Node must be a " + CharClassNode.class.getSimpleName());
+     }
+     CharClassNode charClass = (CharClassNode) node;
+     boolean matchFound = false;
+     for (Node child : charClass.getChildren()) {
+         if (child instanceof CharRangeNode) {
+             // If the current child is a character range, see if the character is within the range.
+             CharRangeNode charRange = (CharRangeNode) child;
+             int charDigit = Character.digit(character, RegexConstants.DECIMAL_RADIX);
+             int startDigit = Character.digit(charRange.getStart(), RegexConstants.DECIMAL_RADIX);
+             int endDigit = Character.digit(charRange.getEnd(), RegexConstants.DECIMAL_RADIX);
+             if (startDigit <= charDigit && charDigit <= endDigit) {
+                 matchFound = true;
+                 break;
+             }
+         } else if (isChar(child, character)) {
+             // The current child is a plain or escaped single character that is the target character. Escaped characters must be handled here rather
+             // than being cast to a character range.
+             matchFound = true;
+             break;
+         }
+     }
+     // If the character class was negated, e.g. [^1-5], it matches against the character if no direct match was found.
+     return charClass.isNegated() != matchFound;
+ }
+
+ /**
+  * Return whether the given node is a character class that would only match against the given character. Single characters in the class, escaped or
+  * otherwise, are compared directly. Character ranges are compared by their base-10 digit values.
+  *
+  * @param node
+  *            the node
+  * @param character
+  *            the character
+  * @return true if the given character class would only match against the given character, or false otherwise
+  * @throws IllegalArgumentException
+  *             if the given node is not a {@link CharClassNode}
+  */
+ public static boolean charClassMatchesOnly(Node node, char character) {
+     if (!(node instanceof CharClassNode)) {
+         throw new IllegalArgumentException("Node must be a " + CharClassNode.class.getSimpleName());
+     }
+     CharClassNode charClass = (CharClassNode) node;
+     boolean matchFound = false;
+     for (Node child : charClass.getChildren()) {
+         if (child instanceof CharRangeNode) {
+             // If the current child is a character range, see if the range only encompasses the target character, e.g. [1-1].
+             CharRangeNode charRange = (CharRangeNode) child;
+             int charDigit = Character.digit(character, RegexConstants.DECIMAL_RADIX);
+             int startDigit = Character.digit(charRange.getStart(), RegexConstants.DECIMAL_RADIX);
+             int endDigit = Character.digit(charRange.getEnd(), RegexConstants.DECIMAL_RADIX);
+             if (startDigit == charDigit && charDigit == endDigit) {
+                 matchFound = true;
+             } else {
+                 // A range encompassing characters other than the target was found.
+                 return false;
+             }
+         } else if (isChar(child, character)) {
+             // The current child is a plain or escaped single character that is the target character. Escaped characters must be handled here rather
+             // than being cast to a character range.
+             matchFound = true;
+         } else {
+             // A character other than the target was found.
+             return false;
+         }
+     }
+     // If the character class was negated, e.g. [^1], it matches against the character if no direct match was found.
+     return charClass.isNegated() != matchFound;
+ }
+
+ /**
+  * Determine whether the given regex element would match against the given character. Digit character classes and any-character wildcards are always
+  * reported as a match, regardless of the character. Single characters and character classes are checked against the character. All other node types
+  * are reported as not matching.
+  *
+  * @param node
+  *            the regex element
+  * @param character
+  *            the character
+  * @return true if the given node would match against the given character, or false otherwise
+  */
+ public static boolean matchesChar(Node node, char character) {
+     boolean matches;
+     switch (node.getType()) {
+         case DIGIT_CHAR_CLASS:
+         case ANY_CHAR:
+             matches = true;
+             break;
+         case SINGLE_CHAR:
+             matches = isChar(node, character);
+             break;
+         case CHAR_CLASS:
+             matches = charClassMatches(node, character);
+             break;
+         default:
+             matches = false;
+     }
+     return matches;
+ }
+
+ /**
+  * Determine whether any direct child of the given group is a {@link SingleCharNode} holding the given character. Children of any other type are ignored.
+  *
+  * @param node
+  *            the node, which must be a {@link GroupNode}
+  * @param character
+  *            the character
+  * @return true if a direct single-character child of the group is the given character, or false otherwise
+  * @throws ClassCastException
+  *             if the given node is not a {@link GroupNode}
+  */
+ public static boolean groupNodeMatches(Node node, char character) {
+     GroupNode group = (GroupNode) node;
+     for (Node child : group.getChildren()) {
+         // Only plain single characters are considered. A non-matching child does not rule out a later match in the group.
+         if (child instanceof SingleCharNode && isChar(child, character)) {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /**
+  * Determine whether the given regex element can only match against the given character. Only single characters and character classes are evaluated; all
+  * other node types are reported as not matching.
+  *
+  * @param node
+  *            the node
+  * @param character
+  *            the character
+  * @return true if the node can match only against the given character or false otherwise.
+  */
+ public static boolean matchesCharOnly(Node node, char character) {
+     boolean matchesOnly;
+     switch (node.getType()) {
+         case SINGLE_CHAR:
+             matchesOnly = isChar(node, character);
+             break;
+         case CHAR_CLASS:
+             matchesOnly = charClassMatchesOnly(node, character);
+             break;
+         default:
+             matchesOnly = false;
+     }
+     return matchesOnly;
+ }
+
+ /**
+  * Determine whether the given regex element can match against the character '0', as decided by {@link #matchesChar(Node, char)}.
+  *
+  * @param node
+  *            the node
+  * @return true if the node can match against '0' or false otherwise.
+  */
+ public static boolean matchesZero(Node node) {
+     final char zero = RegexConstants.ZERO;
+     return matchesChar(node, zero);
+ }
+
+ /**
+  * Determine whether the given regex element matches the given character through a single character, a character class, or a group. All other node
+  * types, including digit character classes and any-character wildcards, are reported as not matching.
+  *
+  * @param node
+  *            the node
+  * @param character
+  *            the character
+  * @return true if the node is a single character, character class, or group that matches the given character, or false otherwise
+  */
+ public static boolean matchesCharExplicitly(Node node, char character) {
+     boolean matches;
+     switch (node.getType()) {
+         case SINGLE_CHAR:
+             matches = isChar(node, character);
+             break;
+         case CHAR_CLASS:
+             matches = charClassMatches(node, character);
+             break;
+         case GROUP:
+             matches = groupNodeMatches(node, character);
+             break;
+         default:
+             matches = false;
+     }
+     return matches;
+ }
+
+ /**
+  * Determine whether the given regex element matches the character '0', as decided by {@link #matchesCharExplicitly(Node, char)}.
+  *
+  * @param node
+  *            the node
+  * @return true if the node explicitly matches '0', or false otherwise
+  */
+ public static boolean matchesZeroExplicitly(Node node) {
+     final char zero = RegexConstants.ZERO;
+     return matchesCharExplicitly(node, zero);
+ }
+
+ /**
+  * Determine whether the given regex element can only match against the character '0', as decided by {@link #matchesCharOnly(Node, char)}.
+  *
+  * @param node
+  *            the node
+  * @return true if the node can match only against '0' or false otherwise.
+  */
+ public static boolean matchesZeroOnly(Node node) {
+     final char zero = RegexConstants.ZERO;
+     return matchesCharOnly(node, zero);
+ }
+
+ /**
+ * Return whether the given node is a quantifier type.
+ *
+ * @param node
+ * the node
+ * @return true if the node is a quantifier type, or false otherwise
+ */
+ public static boolean isQuantifier(Node node) {
+ return RegexConstants.QUANTIFIER_TYPES.contains(node.getClass());
+ }
+
+ /**
+  * Return a range representing the number of occurrences the given node can match against. The left side is the minimum number of occurrences, and the
+  * right side is the maximum number of occurrences, or null if the quantifier has no upper bound (infinity).
+  *
+  * @param node
+  *            the node
+  * @return the occurrence range
+  * @throws IllegalArgumentException
+  *             if the given node is not a quantifier type
+  */
+ public static Pair<Integer,Integer> getQuantifierRange(Node node) {
+     if (!isQuantifier(node)) {
+         throw new IllegalArgumentException("Node must be one of the following quantifier types: " + RegexConstants.QUANTIFIER_TYPES);
+     }
+     int min;
+     // A null maximum represents an unbounded quantifier.
+     Integer max = null;
+     switch (node.getType()) {
+         case ZERO_OR_MORE:
+             // Minimum occurrence of 0.
+             min = 0;
+             break;
+         case ONE_OR_MORE:
+             // Minimum occurrence of 1.
+             min = 1;
+             break;
+         case REPETITION:
+             Node child = node.getFirstChild();
+             if (child instanceof IntegerNode) {
+                 // Minimum and maximum occurrences will be the same.
+                 min = ((IntegerNode) child).getValue();
+                 max = min;
+             } else {
+                 IntegerRangeNode rangeNode = (IntegerRangeNode) child;
+                 // Minimum is defined in range. Maximum may be infinity if not defined.
+                 min = rangeNode.getStart();
+                 if (rangeNode.isEndBounded()) {
+                     max = rangeNode.getEnd();
+                 }
+             }
+             break;
+         default:
+             // Report the type of the offending node rather than the set of known quantifier types.
+             throw new IllegalArgumentException("Unhandled quantifier type: " + node.getType());
+     }
+     return Pair.of(min, max);
+ }
+
+ /**
+ * Return whether the given node represents a simple number regex: a child-type screen first, then a full match of the node's string form.
+ *
+ * @param node
+ * the node to test
+ * @return true if the node is a simple number regex, or false otherwise
+ */
+ public static boolean isSimpleNumber(Node node) {
+ if (node.isAnyChildNotOf(RegexConstants.SIMPLE_NUMBER_TYPES)) { // presumably true when some child's type is outside the set - confirm in Node
+ return false;
+ }
+ String expression = StringVisitor.toString(node); // string form of the tree, as produced by StringVisitor
+ return RegexConstants.SIMPLE_NUMBER_REGEX_PATTERN.matcher(expression).matches(); // matches() requires the whole string to match
+ }
+
+ /**
+ * Return the given digit character as an integer, using {@link Character#digit(char, int)} with {@code RegexConstants.DECIMAL_RADIX}.
+ *
+ * @param digit
+ * the digit character
+ * @return the integer form, or -1 if {@link Character#digit(char, int)} does not accept the character as a digit in that radix
+ */
+ public static int toInt(char digit) {
+ return Character.digit(digit, RegexConstants.DECIMAL_RADIX);
+ }
+
+ /**
+ * Return the given int as a digit character, using {@link Character#forDigit(int, int)} with {@code RegexConstants.DECIMAL_RADIX}.
+ *
+ * @param digit
+ * the int
+ * @return the digit character, or the null character ({@code '\0'}) if {@link Character#forDigit(int, int)} rejects the value for that radix
+ */
+ public static char toChar(int digit) {
+ return Character.forDigit(digit, RegexConstants.DECIMAL_RADIX);
+ }
+
+ /**
+ * Return whether the given quantifier node allows for zero occurrences. Throws {@link IllegalArgumentException} for non-quantifier or unhandled nodes.
+ *
+ * @param node
+ * the quantifier node
+ * @return true if the quantifier allows for zero occurrences, or false otherwise
+ */
+ public static boolean canOccurZeroTimes(Node node) {
+ if (!isQuantifier(node)) {
+ throw new IllegalArgumentException("Node must be one of the following quantifier types: " + RegexConstants.QUANTIFIER_TYPES);
+ }
+ switch (node.getType()) {
+ case ZERO_OR_MORE:
+ return true;
+ case ONE_OR_MORE:
+ return false;
+ case REPETITION:
+ return repetitionCanOccurZeroTimes((RepetitionNode) node); // depends on the repetition's count or range start
+ default:
+ throw new IllegalArgumentException("Unhandled quantifier type: " + node.getType());
+ }
+ }
+
+ /**
+ * Return whether the given repetition quantifier node allows for zero occurrences.
+ *
+ * @param node
+ * the node
+ * @return true if the quantifier allows for zero occurrences, or false otherwise
+ */
+ public static boolean repetitionCanOccurZeroTimes(RepetitionNode node) {
+ Node child = node.getFirstChild();
+ if (child instanceof IntegerNode) {
+ return ((IntegerNode) child).getValue() == 0;
+ } else {
+ return ((IntegerRangeNode) child).getStart() == 0;
+ }
+ }
+
+ /**
+ * Express the given repetition as an occurrence range. An exact count yields equal endpoints; an unbounded range yields a null right endpoint.
+ *
+ * @param node
+ * the repetition quantifier
+ * @return the range
+ */
+ public static Pair getRepetitionAsRange(RepetitionNode node) {
+ Node quantity = node.getFirstChild();
+ // An exact repetition is expressed as a range whose endpoints are both the exact count.
+ if (quantity instanceof IntegerNode) {
+ int exact = ((IntegerNode) quantity).getValue();
+ return Pair.of(exact, exact);
+ }
+ // Otherwise the child is an explicit range; a missing end bound is represented by null.
+ IntegerRangeNode bounds = (IntegerRangeNode) quantity;
+ if (!bounds.isEndBounded()) {
+ return Pair.of(bounds.getStart(), null);
+ }
+ return Pair.of(bounds.getStart(), bounds.getEnd());
+ }
+
+ /**
+ * Return a new range with one subtracted from each endpoint. The left endpoint never drops below 0, and a null right endpoint stays null.
+ *
+ * @param range
+ * the range
+ * @return the updated range
+ */
+ public static Pair subtractOneFrom(Pair range) {
+ int left = range.getLeft() > 0 ? (range.getLeft() - 1) : 0; // clamp: a left endpoint of 0 (or less) yields 0
+ Integer right = range.getRight() == null ? null : (range.getRight() - 1); // NOTE(review): not clamped, so a right endpoint of 0 becomes -1
+ return Pair.of(left, right);
+ }
+
+ /**
+ * Return a new repetition node created from the given range.
+ *
+ * @param range
+ * the range
+ * @return the new repetition node
+ */
+ public static RepetitionNode createRepetition(Pair range) {
+ if (Objects.equals(range.getLeft(), range.getRight())) {
+ return new RepetitionNode(new IntegerNode(range.getLeft()));
+ } else {
+ return new RepetitionNode(new IntegerRangeNode(range.getLeft(), range.getRight()));
+ }
+ }
+
+ /**
+ * Return whether the given repetition quantifier is not a defined range, e.g. {@code {x}} rather than {@code {x,y}} or {@code {x,}}.
+ *
+ * @param node
+ * the repetition quantifier
+ * @return true if the repetition is not a range, or false otherwise
+ */
+ public static boolean isNotRange(RepetitionNode node) {
+ return node.getFirstChild() instanceof IntegerNode; // an IntegerNode first child marks the non-range form
+ }
+
+ /**
+ * Return a new repetition whose range starts at zero and whose end is taken from the given repetition (its exact count, or its range end).
+ *
+ * @param node
+ * the repetition to take the end from
+ * @return the new repetition quantifier
+ */
+ public static RepetitionNode createRangeStartingFromZero(RepetitionNode node) {
+ IntegerRangeNode range = new IntegerRangeNode();
+ range.setStart(0);
+ Node child = node.getFirstChild();
+ if (child instanceof IntegerNode) {
+ range.setEnd(((IntegerNode) child).getValue()); // exact count becomes the end of the new range
+ } else {
+ range.setEnd(((IntegerRangeNode) child).getEnd()); // NOTE(review): isEndBounded() is not checked here - confirm getEnd() for unbounded ranges
+ }
+ return new RepetitionNode(range);
+ }
+
+ private RegexUtils() { // private and always throws: this class is not meant to be instantiated
+ throw new UnsupportedOperationException();
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RepetitionNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RepetitionNode.java
new file mode 100644
index 00000000000..b31f006b0ce
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/RepetitionNode.java
@@ -0,0 +1,36 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents a repetition quantifier in a regex pattern, e.g. {@code {3}}. The node type reported is {@link NodeType#REPETITION}.
+ */
+public class RepetitionNode extends Node {
+
+ public RepetitionNode() {} // no child and no properties supplied
+
+ public RepetitionNode(Node child) { // forwards the child to the Node(Node) constructor
+ super(child);
+ }
+
+ public RepetitionNode(Map properties) { // forwards the properties to the Node(Map) constructor; used by shallowCopy()
+ super(properties);
+ }
+
+ @Override
+ public NodeType getType() {
+ return NodeType.REPETITION;
+ }
+
+ @Override
+ public Object accept(Visitor visitor, Object data) {
+ return visitor.visitRepetition(this, data); // dispatch to the repetition-specific visit method
+ }
+
+ @Override
+ public RepetitionNode shallowCopy() {
+ return new RepetitionNode(this.properties); // only the properties are handed to the copy; no child is passed
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/SingleCharNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/SingleCharNode.java
new file mode 100644
index 00000000000..50e42a9621d
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/SingleCharNode.java
@@ -0,0 +1,46 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * A regex tree node for one literal, non-special character of a pattern; the character is kept in the {@code "char"} property.
+ */
+public class SingleCharNode extends Node {
+
+ public static final String PROPERTY_CHAR = "char";
+
+ public SingleCharNode() {}
+
+ public SingleCharNode(Map properties) {
+ super(properties);
+ }
+
+ public SingleCharNode(char character) {
+ setCharacter(character);
+ }
+
+ public void setCharacter(char character) {
+ setProperty(PROPERTY_CHAR, String.valueOf(character));
+ }
+
+ public char getCharacter() {
+ return getProperty(PROPERTY_CHAR).charAt(0);
+ }
+
+ @Override
+ public Object accept(Visitor visitor, Object data) {
+ return visitor.visitSingleChar(this, data);
+ }
+
+ @Override
+ public NodeType getType() {
+ return NodeType.SINGLE_CHAR;
+ }
+
+ @Override
+ public SingleCharNode shallowCopy() {
+ return new SingleCharNode(this.properties);
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/StartAnchorNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/StartAnchorNode.java
new file mode 100644
index 00000000000..77d3831931d
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/StartAnchorNode.java
@@ -0,0 +1,34 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents a regex start anchor, i.e. {@code ^}. The node type reported is {@link NodeType#START_ANCHOR}.
+ */
+public class StartAnchorNode extends Node {
+
+ protected StartAnchorNode() { // NOTE(review): protected, unlike the public no-arg constructors of sibling node classes - confirm intent
+ super();
+ }
+
+ public StartAnchorNode(Map properties) { // forwards the properties to the Node(Map) constructor; used by shallowCopy()
+ super(properties);
+ }
+
+ @Override
+ public NodeType getType() {
+ return NodeType.START_ANCHOR;
+ }
+
+ @Override
+ public Object accept(Visitor visitor, Object data) {
+ return visitor.visitStartAnchor(this, data); // dispatch to the start-anchor-specific visit method
+ }
+
+ @Override
+ public StartAnchorNode shallowCopy() { // covariant return type, matching the other node classes
+ return new StartAnchorNode(this.properties);
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/ZeroOrMoreNode.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/ZeroOrMoreNode.java
new file mode 100644
index 00000000000..547ab550fad
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/ZeroOrMoreNode.java
@@ -0,0 +1,32 @@
+package datawave.data.normalizer.regex;
+
+import java.util.Map;
+
+import datawave.data.normalizer.regex.visitor.Visitor;
+
+/**
+ * Represents the star ({@code *}) quantifier in a regex pattern. The node type reported is {@link NodeType#ZERO_OR_MORE}.
+ */
+public class ZeroOrMoreNode extends Node {
+
+ public ZeroOrMoreNode() {} // no properties supplied
+
+ public ZeroOrMoreNode(Map properties) { // forwards the properties to the Node(Map) constructor; used by shallowCopy()
+ super(properties);
+ }
+
+ @Override
+ public NodeType getType() {
+ return NodeType.ZERO_OR_MORE;
+ }
+
+ @Override
+ public Object accept(Visitor visitor, Object data) {
+ return visitor.visitZeroToMany(this, data); // the visitor method for this node is named visitZeroToMany
+ }
+
+ @Override
+ public ZeroOrMoreNode shallowCopy() {
+ return new ZeroOrMoreNode(this.properties); // only the properties are handed to the copy
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/AlternationDeduper.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/AlternationDeduper.java
new file mode 100644
index 00000000000..8585bee40d6
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/AlternationDeduper.java
@@ -0,0 +1,57 @@
+package datawave.data.normalizer.regex.visitor;
+
+import java.util.LinkedHashMap;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.Node;
+
+public class AlternationDeduper extends CopyVisitor { // copy visitor that drops alternation children whose pattern string repeats an earlier child
+
+ public static Node dedupe(Node node) { // entry point: null for a null tree, otherwise the result of visiting the tree
+ if (node == null) {
+ return null;
+ }
+ AlternationDeduper visitor = new AlternationDeduper();
+ return (Node) node.accept(visitor, null);
+ }
+
+ @Override
+ public Object visitExpression(ExpressionNode node, Object data) {
+ // If the node's first child is an alternation, dedupe the alternation's children.
+ if (node.getFirstChild() instanceof AlternationNode) {
+ Node visited = (Node) node.getFirstChild().accept(this, data);
+ // If an alternation was returned, multiple patterns were retained. Wrap it in an expression node before returning.
+ if (visited instanceof AlternationNode) {
+ return new ExpressionNode(visited);
+ } else {
+ // Otherwise we only have a single pattern remaining. Return the node as is.
+ return visited;
+ }
+ } else {
+ // Otherwise the tree is treated as holding no alternations. Return a plain copy; copy(node) is the static CopyVisitor.copy(Node).
+ return copy(node);
+ }
+ }
+
+ @Override
+ public Object visitAlternation(AlternationNode node, Object data) {
+ // Use LinkedHashMap to preserve insertion order. Keys are the children's pattern strings, values are copies of the children.
+ LinkedHashMap uniquePatterns = new LinkedHashMap<>();
+ // Check each child for uniqueness.
+ for (Node child : node.getChildren()) {
+ String childPattern = StringVisitor.toString(child);
+ // If the child has a pattern we have not seen before, retain a copy of it. The first occurrence wins.
+ if (!uniquePatterns.containsKey(childPattern)) {
+ uniquePatterns.put(childPattern, copy(child));
+ }
+ }
+
+ // If only one unique pattern remains, return it directly instead of wrapping it in an alternation.
+ if (uniquePatterns.size() == 1) {
+ return uniquePatterns.values().iterator().next();
+ } else {
+ return new AlternationNode(uniquePatterns.values());
+ }
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/AnchorTrimmer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/AnchorTrimmer.java
new file mode 100644
index 00000000000..2925e43c380
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/AnchorTrimmer.java
@@ -0,0 +1,29 @@
+package datawave.data.normalizer.regex.visitor;
+
+import datawave.data.normalizer.regex.EndAnchorNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.StartAnchorNode;
+
+/**
+ * Implementation of {@link CopyVisitor} that returns a copy of a regex tree trimmed of all start and end anchors to simplify the normalization process.
+ */
+public class AnchorTrimmer extends CopyVisitor {
+
+ public static Node trim(Node node) { // entry point: null for a null tree, otherwise the result of visiting the tree
+ if (node == null) {
+ return null;
+ }
+ AnchorTrimmer visitor = new AnchorTrimmer();
+ return (Node) node.accept(visitor, null);
+ }
+
+ @Override
+ public Object visitStartAnchor(StartAnchorNode node, Object data) {
+ return null; // null results are filtered out when the parent is copied, which removes the anchor
+ }
+
+ @Override
+ public Object visitEndAnchor(EndAnchorNode node, Object data) {
+ return null; // null results are filtered out when the parent is copied, which removes the anchor
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/BaseVisitor.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/BaseVisitor.java
new file mode 100644
index 00000000000..a31aea2c57e
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/BaseVisitor.java
@@ -0,0 +1,148 @@
+package datawave.data.normalizer.regex.visitor;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.AnyCharNode;
+import datawave.data.normalizer.regex.CharClassNode;
+import datawave.data.normalizer.regex.CharRangeNode;
+import datawave.data.normalizer.regex.DigitCharClassNode;
+import datawave.data.normalizer.regex.EmptyNode;
+import datawave.data.normalizer.regex.EncodedNumberNode;
+import datawave.data.normalizer.regex.EncodedPatternNode;
+import datawave.data.normalizer.regex.EndAnchorNode;
+import datawave.data.normalizer.regex.EscapedSingleCharNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.GroupNode;
+import datawave.data.normalizer.regex.IntegerNode;
+import datawave.data.normalizer.regex.IntegerRangeNode;
+import datawave.data.normalizer.regex.OneOrMoreNode;
+import datawave.data.normalizer.regex.QuestionMarkNode;
+import datawave.data.normalizer.regex.RepetitionNode;
+import datawave.data.normalizer.regex.SingleCharNode;
+import datawave.data.normalizer.regex.StartAnchorNode;
+import datawave.data.normalizer.regex.ZeroOrMoreNode;
+
+/**
+ * A basic {@link Visitor} implementation that passes itself to the children of every node it visits and returns the supplied data unchanged.
+ */
+public class BaseVisitor implements Visitor {
+
+ @Override
+ public Object visitExpression(ExpressionNode node, Object data) {
+ node.childrenAccept(this, data); // every visit method below follows this same pattern
+ return data;
+ }
+
+ @Override
+ public Object visitAlternation(AlternationNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitGroup(GroupNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitDigitChar(DigitCharClassNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitCharClass(CharClassNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitCharRange(CharRangeNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitSingleChar(SingleCharNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitEscapedSingleChar(EscapedSingleCharNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitRepetition(RepetitionNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitQuestionMark(QuestionMarkNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitAnyChar(AnyCharNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitZeroToMany(ZeroOrMoreNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitOneToMany(OneOrMoreNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitInteger(IntegerNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitIntegerRange(IntegerRangeNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitEmpty(EmptyNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitStartAnchor(StartAnchorNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitEndAnchor(EndAnchorNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitEncodedNumber(EncodedNumberNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+
+ @Override
+ public Object visitEncodedPattern(EncodedPatternNode node, Object data) {
+ node.childrenAccept(this, data);
+ return data;
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/BinFinder.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/BinFinder.java
new file mode 100644
index 00000000000..7f6fe611842
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/BinFinder.java
@@ -0,0 +1,210 @@
+package datawave.data.normalizer.regex.visitor;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import datawave.data.normalizer.regex.IntegerNode;
+import datawave.data.normalizer.regex.IntegerRangeNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.NodeListIterator;
+import datawave.data.normalizer.regex.RegexConstants;
+import datawave.data.normalizer.regex.RegexUtils;
+
+/**
+ * Abstract class for {@link LTOneBinFinder} and {@link GTEOneBinFinder} with common properties and functionality.
+ */
+abstract class BinFinder {
+
+ // The original node.
+ protected final Node node;
+
+ // An iterator for the node's children.
+ protected final NodeListIterator childrenIter;
+
+ // The index of the decimal point in the node's children, possibly -1.
+ protected final int decimalPointIndex;
+
+ // The smallest bin value.
+ protected final int minBin;
+
+ // The highest bin value.
+ protected final int maxBin;
+
+ // The initial value for the lower and upper endpoints. Endpoints still equal to this value mean no bin range was found.
+ protected final int initialEndpointValue;
+
+ // The current lower end of the bin range.
+ protected int lower;
+
+ // The current upper end of the bin range.
+ protected int upper;
+
+ protected boolean lowerLocked; // when true, incrementLower() and incrementLower(int) leave lower unchanged
+
+ protected BinFinder(Node node, int minBin, int maxBin, int initialEndpointValue) {
+ this.node = node;
+ this.decimalPointIndex = RegexUtils.getDecimalPointIndex(node);
+ this.minBin = minBin;
+ this.maxBin = maxBin;
+ this.initialEndpointValue = initialEndpointValue;
+ this.childrenIter = node.getChildrenIterator();
+
+ // Set the initial end point values.
+ this.lower = initialEndpointValue;
+ this.upper = initialEndpointValue;
+
+ // If the first child is a hyphen, skip over it and start at the next child.
+ if (RegexUtils.isChar(node.getFirstChild(), RegexConstants.HYPHEN)) {
+ childrenIter.next();
+ }
+ }
+
+ protected abstract Pair getBinRange();
+
+ /**
+ * Increment lower by one, unless the lower bound is locked.
+ */
+ protected void incrementLower() {
+ if (!lowerLocked) {
+ lower++;
+ }
+ }
+
+ /**
+ * Increment lower by the given value, unless the lower bound is locked.
+ *
+ * @param value
+ * the value
+ */
+ protected void incrementLower(int value) {
+ if (!lowerLocked) {
+ lower += value;
+ }
+ }
+
+ /**
+ * Lock modifications to the lower bound. Any subsequent calls to {@link #incrementLower()} or {@link #incrementLower(int)} will not modify the lower bound.
+ */
+ protected void lockLower() {
+ this.lowerLocked = true;
+ }
+
+ /**
+ * Unlock modifications to the lower bound. Any subsequent calls to {@link #incrementLower()} or {@link #incrementLower(int)} will modify the lower bound.
+ */
+ protected void unlockLower() {
+ this.lowerLocked = false;
+ }
+
+ /**
+ * Set lower to the initial endpoint value. This assignment is made regardless of whether the lower bound is locked.
+ */
+ protected void setLowerToInitialEndpointValue() {
+ this.lower = initialEndpointValue;
+ }
+
+ /**
+ * Increment upper by one.
+ */
+ protected void incrementUpper() {
+ upper++;
+ }
+
+ /**
+ * Increment upper by the given value.
+ *
+ * @param value
+ * the value
+ */
+ protected void incrementUpper(int value) {
+ upper += value;
+ }
+
+ /**
+ * Set upper to the max bin value.
+ */
+ protected void setUpperToMax() {
+ upper = maxBin;
+ }
+
+ /**
+ * Normalize the endpoints to be within the min and max bin if they were updated.
+ */
+ protected void normalizeRange() {
+ // Do not normalize if both the upper and lower are the initial endpoint value. This indicates that a valid bin range was not found.
+ if (lower != initialEndpointValue || upper != initialEndpointValue) {
+ // Normalize the bin range to be within a valid bin range. If the lower bound is less than the min bin, set it to the min bin. If it is greater than
+ // the max bin, set it to the max bin.
+ if (lower < minBin) {
+ lower = minBin;
+ } else if (lower > maxBin) {
+ lower = maxBin;
+ }
+
+ // If the upper bound is greater than the max bin, set it to the max bin. The upper bound is not compared against the min bin.
+ if (upper > maxBin) {
+ upper = maxBin;
+ }
+ }
+ }
+
+ /**
+ * Return a {@link Pair} with the lower and upper bin range endpoints, or null if no valid bin range was found.
+ *
+ * @return the bin range
+ */
+ protected Pair getEndpoints() {
+ if (lower != initialEndpointValue || upper != initialEndpointValue) {
+ return Pair.of(lower, upper);
+ } else {
+ return null;
+ }
+ }
+
+ /**
+ * Update lower and upper based on the quantities read from the next quantifier in the iterator.
+ */
+ protected void updateRangeWithNextQuantifier() {
+ // Update the range.
+ updateRangeWithQuantifier(childrenIter.next());
+ // If the node after the quantifier node is a question mark, skip over it.
+ childrenIter.seekPastQuestionMarks();
+ }
+
+ /**
+ * Update lower and upper based off the quantities read from the given quantifier. Node types without a case below leave the range unchanged.
+ */
+ protected void updateRangeWithQuantifier(Node quantifier) {
+ switch (quantifier.getType()) {
+ case REPETITION:
+ // In the case of a repetition node, we may have an IntegerNode or IntegerRangeNode child.
+ Node child = quantifier.getFirstChild();
+ if (child instanceof IntegerNode) {
+ // Increment both the upper and lower bound by the repetition value.
+ int value = ((IntegerNode) child).getValue();
+ incrementLower(value);
+ incrementUpper(value);
+ } else {
+ IntegerRangeNode rangeNode = (IntegerRangeNode) child;
+ // Increment the lower bound by the range start value.
+ incrementLower(rangeNode.getStart());
+ // If the end of the range has a bound, increment the upper bound by the end bound. Otherwise, set the upper bound to the max.
+ if (rangeNode.isEndBounded()) {
+ incrementUpper(rangeNode.getEnd());
+ } else {
+ setUpperToMax();
+ }
+ }
+ break;
+ case ZERO_OR_MORE:
+ // Set the upper to the max. Do not modify the lower bound.
+ setUpperToMax();
+ break;
+ case ONE_OR_MORE:
+ // Set the upper bound to the max.
+ setUpperToMax();
+ // Increment the lower bound by one.
+ incrementLower();
+ break;
+ }
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/CopyVisitor.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/CopyVisitor.java
new file mode 100644
index 00000000000..ce7620c1ec9
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/CopyVisitor.java
@@ -0,0 +1,161 @@
+package datawave.data.normalizer.regex.visitor;
+
+import java.util.Objects;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.AnyCharNode;
+import datawave.data.normalizer.regex.CharClassNode;
+import datawave.data.normalizer.regex.CharRangeNode;
+import datawave.data.normalizer.regex.DigitCharClassNode;
+import datawave.data.normalizer.regex.EmptyNode;
+import datawave.data.normalizer.regex.EncodedNumberNode;
+import datawave.data.normalizer.regex.EncodedPatternNode;
+import datawave.data.normalizer.regex.EndAnchorNode;
+import datawave.data.normalizer.regex.EscapedSingleCharNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.GroupNode;
+import datawave.data.normalizer.regex.IntegerNode;
+import datawave.data.normalizer.regex.IntegerRangeNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.OneOrMoreNode;
+import datawave.data.normalizer.regex.QuestionMarkNode;
+import datawave.data.normalizer.regex.RepetitionNode;
+import datawave.data.normalizer.regex.SingleCharNode;
+import datawave.data.normalizer.regex.StartAnchorNode;
+import datawave.data.normalizer.regex.ZeroOrMoreNode;
+
+/**
+ * A {@link Visitor} implementation that returns a copy of a given {@link Node} tree.
+ */
+public class CopyVisitor implements Visitor {
+
+ /**
+ * Return a copy of the given node tree, or null if the node is null. Any null children will be filtered out.
+ *
+ * @param node
+ * the tree to copy
+ * @return the copy
+ */
+ public static Node copy(Node node) {
+ if (node == null) {
+ return null;
+ }
+ CopyVisitor visitor = new CopyVisitor(); // always a plain CopyVisitor, even when this static method is called from a subclass
+ return (Node) node.accept(visitor, null);
+ }
+
+ /**
+ * Return a copy of the given node: a shallow copy of the node itself, plus the non-null results of visiting each child with this visitor.
+ *
+ * @param node
+ * the node to copy
+ * @param data
+ * the data passed along to each child's accept call
+ * @return the copy
+ */
+ protected Node copy(Node node, Object data) {
+ Node copy = node.shallowCopy();
+ node.getChildren().stream().map((child) -> (Node) child.accept(this, data)).filter(Objects::nonNull).forEach(copy::addChild);
+ return copy;
+ }
+
+ @Override
+ public Object visitExpression(ExpressionNode node, Object data) {
+ return copy(node, data); // every visit method below copies its node the same way
+ }
+
+ @Override
+ public Object visitAlternation(AlternationNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitGroup(GroupNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitDigitChar(DigitCharClassNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitCharClass(CharClassNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitCharRange(CharRangeNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitSingleChar(SingleCharNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitEscapedSingleChar(EscapedSingleCharNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitRepetition(RepetitionNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitQuestionMark(QuestionMarkNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitAnyChar(AnyCharNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitZeroToMany(ZeroOrMoreNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitOneToMany(OneOrMoreNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitInteger(IntegerNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitIntegerRange(IntegerRangeNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitEmpty(EmptyNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitStartAnchor(StartAnchorNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitEndAnchor(EndAnchorNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitEncodedNumber(EncodedNumberNode node, Object data) {
+ return copy(node, data);
+ }
+
+ @Override
+ public Object visitEncodedPattern(EncodedPatternNode node, Object data) {
+ return copy(node, data);
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/DecimalPointPlacer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/DecimalPointPlacer.java
new file mode 100644
index 00000000000..019d91a9e14
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/DecimalPointPlacer.java
@@ -0,0 +1,561 @@
+package datawave.data.normalizer.regex.visitor;
+
+import static datawave.data.normalizer.regex.RegexUtils.createRepetition;
+import static datawave.data.normalizer.regex.RegexUtils.getRepetitionAsRange;
+import static datawave.data.normalizer.regex.RegexUtils.subtractOneFrom;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.EncodedPatternNode;
+import datawave.data.normalizer.regex.EscapedSingleCharNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.GroupNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.NodeListIterator;
+import datawave.data.normalizer.regex.NodeType;
+import datawave.data.normalizer.regex.OneOrMoreNode;
+import datawave.data.normalizer.regex.QuestionMarkNode;
+import datawave.data.normalizer.regex.RegexConstants;
+import datawave.data.normalizer.regex.RegexUtils;
+import datawave.data.normalizer.regex.RepetitionNode;
+import datawave.data.normalizer.regex.ZeroOrMoreNode;
+
+/**
+ * Implementation of {@link CopyVisitor} that return a copy of a regex tree with decimal places inserted where required in encoded regex patterns. Patterns
+ * starting with an element that has a quantifier {@code (* + or {x})} will see the quantifier modified as required to ensure a decimal place is inserted
+ * correctly. Multiple optional decimal points may be added to a single regex pattern.
+ */
+public class DecimalPointPlacer extends CopyVisitor {
+
+ public static Node addDecimalPoints(Node node) {
+ if (node == null) {
+ return null;
+ }
+ DecimalPointPlacer visitor = new DecimalPointPlacer();
+ return (Node) node.accept(visitor, null);
+ }
+
+ @Override
+ public Object visitEncodedPattern(EncodedPatternNode node, Object data) {
+ // Operate on a copy of the node.
+ Node copy = copy(node);
+
+ // Create an initial encoded pattern node with all the leading bin info.
+ EncodedPatternNode encodedPattern = new EncodedPatternNode();
+ NodeListIterator iter = copy.getChildrenIterator();
+ while (iter.hasNext()) {
+ Node next = iter.next();
+ encodedPattern.addChild(next);
+ if (RegexUtils.isChar(next, RegexConstants.CAPITAL_E)) {
+ break;
+ }
+ }
+
+ // Determine what character is equivalent to the zero character. For patterns matching positive numbers, this is '0'. For patterns matching negative
+ // numbers, this is '9'.
+ boolean positiveNumber = RegexUtils.isChar(node.getFirstChild(), RegexConstants.PLUS);
+ char zeroChar = positiveNumber ? RegexConstants.ZERO : RegexConstants.NINE;
+
+ // Get a list of nodes with decimal points added and add them to the pattern node.
+ DecimalPointAdder adder = new DecimalPointAdder(iter, zeroChar);
+ List nodes = adder.addDecimalPoints();
+ encodedPattern.addChildren(nodes);
+
+ // Add the remaining children to the pattern node.
+ while (iter.hasNext()) {
+ encodedPattern.addChild(iter.next());
+ }
+ return encodedPattern;
+ }
+
+ private static class DecimalPointAdder {
+
+ // The node iterator.
+ private final NodeListIterator iter;
+
+ // The character that is the equivalent to zero. For patterns matching positive numbers: '0'. For patterns matching negative numbers: '9'.
+ private final char zeroChar;
+
+ // The nodes enriched with decimal points.
+ private final List nodes = new ArrayList<>();
+
+ // The most recent element.
+ private Node currentElement;
+
+ // The most recent quantifier.
+ private Node currentQuantifier;
+
+ // The most recent question mark.
+ private Node currentQuestionMark;
+
+ // Whether any decimal points have been added.
+ boolean addedAnyDecimalPoints;
+
+ // Whether additional optional decimal points should be added.
+ boolean addMoreDecimalPoints = true;
+
+ // Whether a non-leading zero has been seen.
+ boolean nonLeadingZeroSeen = false;
+
+ public DecimalPointAdder(NodeListIterator iter, char zeroChar) {
+ this(iter, zeroChar, false);
+ }
+
+ private DecimalPointAdder(NodeListIterator iter, char zeroChar, boolean addedAnyDecimalPoints) {
+ this.iter = iter;
+ this.zeroChar = zeroChar;
+ this.addedAnyDecimalPoints = addedAnyDecimalPoints;
+ }
+
+ /**
+ * Return a list of nodes enriched with decimal points. This list is not guaranteed to contain all nodes found within the iterator supplied to
+ * {@link #DecimalPointAdder(NodeListIterator, char)}, so subsequent calls to {@link NodeListIterator#next()} should be made to the iterator after the
+ * fact to retrieve any remaining nodes.
+ */
+ public List addDecimalPoints() {
+ // If we can skip adding decimal points, do so.
+ if (skipAddingDecimalPoints()) {
+ return nodes;
+ }
+
+ // Add decimal points until either there are no more elements or if we have created a final decimal point.
+ while (iter.hasNext() && addMoreDecimalPoints) {
+ // Capture the current element, quantifier, and optional.
+ captureNext();
+
+ switch (currentElement.getType()) {
+ case GROUP:
+ addGroup();
+ break;
+ case ANY_CHAR:
+ case CHAR_CLASS:
+ case DIGIT_CHAR_CLASS:
+ case SINGLE_CHAR:
+ // If we have seen a non-leading zero, mark it so.
+ if (!matchesZero(currentElement)) {
+ nonLeadingZeroSeen = true;
+ }
+ // Quantified characters must be handled differently from non-quantified characters.
+ if (currentQuantifier == null) {
+ addNonQuantifiedElement();
+ } else {
+ addQuantifiedElement();
+ }
+ break;
+ default:
+ throw new IllegalArgumentException("Unhandled element type: " + currentElement.getType());
+ }
+
+ // Mark whether we've added any decimal points only after processing the first decimal point.
+ addedAnyDecimalPoints = true;
+ }
+
+ return nodes;
+ }
+
+ /**
+ * Return whether the entire pattern after the bin information consists of .*, .+, or a non-quantified element.
+ *
+ * @return true if decimal points do not need to be added to this pattern, or false otherwise
+ */
+ private boolean skipAddingDecimalPoints() {
+ int originalIndex = iter.index();
+ try {
+ Node element = iter.next();
+ Node quantifier = iter.isNextQuantifier() ? iter.next() : null;
+ iter.seekPastQuestionMarks();
+
+ // If there is a second element, we cannot skip adding decimal points.
+ if (iter.hasNext()) {
+ return false;
+ } else {
+ // If the sole element is a wildcard, we do not need to add decimal points if it is '.' '.*' or '.+'.
+ if (element.getType() == NodeType.ANY_CHAR) {
+ return quantifier == null || quantifier instanceof ZeroOrMoreNode || quantifier instanceof OneOrMoreNode;
+ } else if (element.getType() == NodeType.GROUP) {
+ // If the sole element is a group, we likely need to add decimal points.
+ return false;
+ } else {
+ // If sole element is not a wildcard, but has no quantifier, we do not need to add decimal points.
+ return quantifier == null;
+ }
+ }
+
+ } finally {
+ iter.setIndex(originalIndex);
+ }
+ }
+
+ /**
+ * Capture the next element, quantifier, and optional.
+ */
+ private void captureNext() {
+ currentElement = iter.next();
+ currentQuantifier = iter.isNextQuantifier() ? iter.next() : null;
+ currentQuestionMark = iter.isNextQuestionMark() ? iter.next() : null;
+ }
+
+ /**
+ * The current element is either an optional group of leading zeros with a defined range that must occur more than once, or a group of ending
+ * alternations.
+ */
+ private void addGroup() {
+ if (currentElement.getFirstChild().getType() == NodeType.ALTERNATION) {
+ addEndingAlternationsGroup();
+ } else {
+ addLeadingZeroGroup();
+ }
+ }
+
+ private void addLeadingZeroGroup() {
+ Node innerElement = currentElement.getFirstChild();
+ Node innerQuantifier = currentElement.getChildAt(1);
+ Node innerQuestionMark = currentElement.getChildCount() == 3 ? currentElement.getChildAt(2) : null;
+
+ // If the inner element can only match zero, we do not need to insert any decimal points. Add them as is.
+ if (matchesZeroOnly(innerElement)) {
+ addAllCurrentToNodes();
+ } else {
+ // Get the group's children with a decimal point inserted where appropriate. Require the decimal point to be optional.
+ List nodes = getRepetitionQuantifiedElements(innerElement, innerQuantifier, innerQuestionMark, true);
+ GroupNode groupNode = new GroupNode();
+ groupNode.addChildren(nodes);
+ this.nodes.add(groupNode);
+ this.nodes.add(new QuestionMarkNode());
+ }
+ }
+
+ private void addEndingAlternationsGroup() {
+ // The current element is a group with an alternation child that has expressions that we may need to add decimal points to.
+ AlternationNode alternation = new AlternationNode();
+ for (Node expression : currentElement.getFirstChild().getChildren()) {
+ NodeListIterator expressionIter = expression.getChildrenIterator();
+ DecimalPointAdder adder = new DecimalPointAdder(expressionIter, zeroChar, addedAnyDecimalPoints);
+ List nodes = adder.addDecimalPoints();
+ while (expressionIter.hasNext()) {
+ nodes.add(expressionIter.next());
+ }
+ ExpressionNode newExpression = new ExpressionNode(nodes);
+ alternation.addChild(newExpression);
+ }
+ this.nodes.add(new GroupNode(alternation));
+ }
+
+ /**
+ * Add a decimal point based on a current element that is not quantified.
+ */
+ private void addNonQuantifiedElement() {
+ // Add the current nodes.
+ addCurrentElementToNodes();
+ addCurrentQuestionMarkToNodes();
+
+ // If this is the last element in the regex expression, do not add any decimal points.
+ if (!iter.hasNext()) {
+ return;
+ }
+
+ // Add a decimal point.
+ addDecimalPointToNodes();
+
+ if (currentQuestionMark != null) {
+ // If the current element is optional, make the decimal point optional.
+ addQuestionMarkToNodes();
+ } else {
+ // Otherwise if we have added any optional decimal points before this one, or the remaining pattern can be zero-length, make the decimal point
+ // optional.
+ if (addedAnyDecimalPoints || remainingPatternCanBeZeroLength()) {
+ addQuestionMarkToNodes();
+ }
+ // Stop adding more decimal points.
+ addMoreDecimalPoints = false;
+ }
+ }
+
+ /**
+ * Add decimal points based on a current element that is quantified.
+ */
+ private void addQuantifiedElement() {
+ switch (currentQuantifier.getType()) {
+ case ZERO_OR_MORE:
+ // Add decimal point for quantifier *.
+ addZeroOrMoreQuantifiedElement();
+ break;
+ case ONE_OR_MORE:
+ // Add decimal point for quantifier +.
+ addOneOrMoreQuantifiedElement();
+ break;
+ case REPETITION:
+ // Add decimal point for quantifier {x}.
+ this.nodes.addAll(getRepetitionQuantifiedElements(currentElement, currentQuantifier, currentQuestionMark, false));
+ break;
+ }
+ }
+
+ /**
+ * Add a decimal point for a current element that is followed by *.
+ */
+ private void addZeroOrMoreQuantifiedElement() {
+ // If the current element is a wildcard, we're looking at .* and can add it as is.
+ if (currentElement.getType() == NodeType.ANY_CHAR) {
+ addAllCurrentToNodes();
+ } else {
+ // Add an optional variant of the current element.
+ addCurrentElementToNodes();
+ addQuestionMarkToNodes();
+ // Add an optional decimal point.
+ addDecimalPointToNodes();
+ addQuestionMarkToNodes();
+ // Add the current element again, followed by the current quantifier and optional.
+ addAllCurrentToNodes();
+ }
+ }
+
+ /**
+ * Add a decimal point for a current element that is followed by +.
+ */
+ private void addOneOrMoreQuantifiedElement() {
+ // Add the current element, non-optional.
+ addCurrentElementToNodes();
+ // Add an optional decimal point.
+ addDecimalPointToNodes();
+ addQuestionMarkToNodes();
+ // Add the current element again, but this time followed by a *, as well as the current optional.
+ addCurrentElementToNodes();
+ nodes.add(new ZeroOrMoreNode());
+ addCurrentQuestionMarkToNodes();
+ // Do not add any more decimal points after this.
+ addMoreDecimalPoints = false;
+ }
+
+ /**
+ * Add decimal points for a current element that is followed by a repetition.
+ */
+ private List getRepetitionQuantifiedElements(Node element, Node quantifier, Node questionMark, boolean makeDecimalOptional) {
+ List nodes = new ArrayList<>();
+
+ // Add an initial copy of the current element.
+ nodes.add(copy(element));
+
+ // Get the repetition range from the quantifier node.
+ Pair repetitionRange = getRepetitionAsRange((RepetitionNode) quantifier);
+ boolean elementMarkedOptional = false;
+ if (repetitionRange.getLeft() == 0) {
+ // If the repetition range starts with 0, either {0,} or {0,x}, make the first occurrence of the element optional.
+ nodes.add(new QuestionMarkNode());
+ elementMarkedOptional = true;
+ }
+
+ // Subtract one from both endpoints of the repetition since we have added an initial single copy of the element to the nodes already. What we do
+ // next will depend on what the updated repetition range now covers.
+ repetitionRange = subtractOneFrom(repetitionRange);
+
+ // The new repetition range is {0,}, which is equivalent to *.
+ if (repetitionRange.getLeft() == 0 && repetitionRange.getRight() == null) {
+ nodes.add(createDecimalPoint());
+ nodes.add(new QuestionMarkNode());
+ nodes.add(copy(element));
+ nodes.add(new ZeroOrMoreNode());
+ if (questionMark != null) {
+ nodes.add(copy(questionMark));
+ }
+ } else if (repetitionRange.getLeft() == 1 && repetitionRange.getRight() == null) {
+ // The new repetition range is {1,}, which is equivalent to +.
+ nodes.add(createDecimalPoint());
+ if (makeDecimalOptional) {
+ nodes.add(new QuestionMarkNode());
+ }
+ nodes.add(copy(element));
+ nodes.add(new OneOrMoreNode());
+ if (questionMark != null) {
+ nodes.add(copy(questionMark));
+ }
+ } else if (repetitionRange.getRight() == null) {
+ // The new repetition range is {x,}.
+ nodes.add(createDecimalPoint());
+ if (makeDecimalOptional) {
+ nodes.add(new QuestionMarkNode());
+ }
+ nodes.add(copy(element));
+ nodes.add(createRepetition(repetitionRange));
+ if (questionMark != null) {
+ nodes.add(copy(questionMark));
+ }
+ } else if (repetitionRange.getLeft() == 0 && repetitionRange.getRight() > 0) {
+ // The new repetition range is {0,x}.
+ nodes.add(createDecimalPoint());
+ // If either we're looking at an optional group, or we have added any decimal points before, or we have not seen a non-leading zero,
+ // or there is only one more element, or the remaining pattern can be zero-length, make the decimal point optional.
+ if (iter.hasNext()) {
+ if (makeDecimalOptional || addedAnyDecimalPoints || !nonLeadingZeroSeen || remainingPatternCanBeZeroLength()
+ || remainingPatternHasOnlyOneMoreElement()) {
+ nodes.add(new QuestionMarkNode());
+ }
+ } else {
+ nodes.add(new QuestionMarkNode());
+ }
+ nodes.add(copy(element));
+ nodes.add(createRepetition(repetitionRange));
+ if (questionMark != null) {
+ nodes.add(copy(questionMark));
+ }
+ } else if (repetitionRange.getLeft() == 1 && repetitionRange.getRight() == 1) {
+ // The new repetition range is {1,1}. Another instance of the element can be added without a repetition after it.
+ nodes.add(createDecimalPoint());
+ if (makeDecimalOptional) {
+ nodes.add(new QuestionMarkNode());
+ }
+ nodes.add(copy(element));
+ } else if (repetitionRange.getLeft() > 0 || repetitionRange.getRight() > 0) {
+ // The new repetition range is {x,y}. Add an instance of the element with the repetition after it.
+ nodes.add(createDecimalPoint());
+ if (makeDecimalOptional) {
+ nodes.add(new QuestionMarkNode());
+ }
+ nodes.add(copy(element));
+ nodes.add(createRepetition(repetitionRange));
+ if (questionMark != null) {
+ nodes.add(copy(questionMark));
+ }
+ } else if (repetitionRange.getLeft() == 0 && repetitionRange.getRight() == 0) {
+ // The new repetition range is {0,0}. Do not add another instance of the element. If the remaining pattern cam be zero-length, or the first
+ // instance of the element was marked optional, make the decimal point optional.
+ if (iter.hasNext()) {
+ nodes.add(createDecimalPoint());
+ if (makeDecimalOptional || remainingPatternCanBeZeroLength() || elementMarkedOptional) {
+ nodes.add(new QuestionMarkNode());
+ }
+ }
+ }
+ if (nonLeadingZeroSeen) {
+ addMoreDecimalPoints = false;
+ }
+ return nodes;
+ }
+
+ /**
+ * Add a copy of {@link #currentElement} to the node list.
+ */
+ private void addCurrentElementToNodes() {
+ nodes.add(copy(currentElement));
+ }
+
+ /**
+ * Add a copy of {@link #currentQuantifier} to the node list if it is not null.
+ */
+ private void addCurrentQuantifierToNodes() {
+ if (currentQuantifier != null) {
+ nodes.add(copy(currentQuantifier));
+ }
+ }
+
+ /**
+ * Add a copy of {@link #currentQuestionMark} to the node list if it is not null.
+ */
+ private void addCurrentQuestionMarkToNodes() {
+ if (currentQuestionMark != null) {
+ nodes.add(copy(currentQuestionMark));
+ }
+ }
+
+ /**
+ * Add the current element, quantifier, and question mark to the node list.
+ */
+ private void addAllCurrentToNodes() {
+ addCurrentElementToNodes();
+ addCurrentQuantifierToNodes();
+ addCurrentQuestionMarkToNodes();
+ }
+
+ /**
+ * Add a new {@code "\."} to the node list.
+ */
+ private void addDecimalPointToNodes() {
+ nodes.add(createDecimalPoint());
+ }
+
+ /**
+ * Return a new escaped decimal point as a node.
+ */
+ private Node createDecimalPoint() {
+ return new EscapedSingleCharNode(RegexConstants.PERIOD);
+ }
+
+ /**
+ * Add a new {@code "?"} to the node list.
+ */
+ private void addQuestionMarkToNodes() {
+ nodes.add(new QuestionMarkNode());
+ }
+
+ /**
+ * Return whether all remaining elements in the iterator can either occur zero times or match a zero.
+ *
+ * @return true if the remaining pattern can be zero-length, or false otherwise.
+ */
+ private boolean remainingPatternCanBeZeroLength() {
+ // Mark the original index so that we can reset the iterator before exiting this method.
+ int originalIndex = iter.index();
+
+ // Seek past all zero-matching elements.
+ iter.seekPastZeroMatchingElements();
+
+ boolean canBeZeroLength = true;
+ while (iter.hasNext()) {
+ Node next = iter.next();
+ // If the next element can match zero, it could be a trailing zero that would get trimmed from encoded numbers.
+ if (matchesZero(next)) {
+ iter.seekPastQuantifiers();
+ iter.seekPastQuestionMarks();
+ } else {
+ // If the next element cannot match zero, it could still occur zero times based on its quantifier (if present).
+ if (iter.hasNext() && iter.isNextQuantifier()) {
+ Node quantifier = iter.next();
+ if (quantifier instanceof OneOrMoreNode) {
+ // If the element is followed by +, it must occur at least once. Remaining pattern cannot be zero-length.
+ canBeZeroLength = false;
+ break;
+ } else if (quantifier instanceof RepetitionNode) {
+ // If the remaining element is not followed by repetition variation of {0} or {0,x}, it cannot occur zero times. Remaining pattern
+ // cannot be zero-length.
+ if (!RegexUtils.repetitionCanOccurZeroTimes((RepetitionNode) quantifier)) {
+ canBeZeroLength = false;
+ break;
+ }
+ }
+ } else {
+ // If there is no quantifier, then the current element must occur. Remaining pattern cannot be zero-length.
+ canBeZeroLength = false;
+ break;
+ }
+ }
+ }
+ iter.setIndex(originalIndex);
+ return canBeZeroLength;
+ }
+
+ /**
+ * Return whether only one more element (possibly quantified and/or optional) remains in the iterator.
+ */
+ private boolean remainingPatternHasOnlyOneMoreElement() {
+ int originalIndex = iter.index();
+ iter.next();
+ iter.seekPastQuantifiers();
+ iter.seekPastQuestionMarks();
+ boolean hasOnlyOneMore = !iter.hasNext();
+ iter.setIndex(originalIndex);
+ return hasOnlyOneMore;
+ }
+
+ private boolean matchesZero(Node node) {
+ return RegexUtils.matchesChar(node, zeroChar);
+ }
+
+ private boolean matchesZeroOnly(Node node) {
+ return RegexUtils.matchesCharOnly(node, zeroChar);
+ }
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/DecimalPointValidator.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/DecimalPointValidator.java
new file mode 100644
index 00000000000..8254f34821a
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/DecimalPointValidator.java
@@ -0,0 +1,58 @@
+package datawave.data.normalizer.regex.visitor;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.NodeListIterator;
+import datawave.data.normalizer.regex.RegexUtils;
+
+/**
+ * Implementation of {@link BaseVisitor} that accepts a {@link Node} tree and verifies that each alternated expression does not contain more than one decimal
+ * point.
+ */
+public class DecimalPointValidator extends BaseVisitor {
+
+ public static void validate(Node node) { // Does nothing for a null tree; otherwise visits the tree with a new validator.
+ if (node != null) {
+ DecimalPointValidator visitor = new DecimalPointValidator();
+ node.accept(visitor, null);
+ }
+ }
+
+ @Override
+ public Object visitExpression(ExpressionNode node, Object data) {
+ if (node.getFirstChild() instanceof AlternationNode) { // Alternations are handed to the superclass traversal rather than checked here.
+ return super.visitExpression(node, data);
+ } else {
+ checkForInvalidDecimalPoints(node);
+ }
+ return null;
+ }
+
+ /**
+ * Check the children of the given node and throw an {@link IllegalArgumentException} if more than one of them is a decimal point.
+ *
+ * @param node
+ * the node to validate
+ */
+ private void checkForInvalidDecimalPoints(Node node) {
+ boolean decimalPointSeen = false;
+ NodeListIterator iter = node.getChildrenIterator();
+ // Iterate through each element.
+ while (iter.hasNext()) {
+ // Get the next element.
+ Node next = iter.next();
+ // If the current element is a decimal point, validate it.
+ if (RegexUtils.isDecimalPoint(next)) {
+ if (decimalPointSeen) {
+ throw new IllegalArgumentException("Regex may not contain expressions with more than one decimal point.");
+ } else {
+ decimalPointSeen = true;
+ }
+ }
+ // Skip past any quantifiers or optionals if specified.
+ iter.seekPastQuantifiers();
+ iter.seekPastQuestionMarks();
+ }
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/EmptyLeafTrimmer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/EmptyLeafTrimmer.java
new file mode 100644
index 00000000000..0ef9cecbed9
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/EmptyLeafTrimmer.java
@@ -0,0 +1,79 @@
+package datawave.data.normalizer.regex.visitor;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.EmptyNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.GroupNode;
+import datawave.data.normalizer.regex.Node;
+
+/**
+ * Implementation of {@link CopyVisitor} that will return a copy of the tree trimmed such that the following modifications are made to it:
+ *
+ * - Remove all {@link EmptyNode} instances.
+ * - Remove all {@link GroupNode} instances that subsequently have no children.
+ * - Remove all {@link AlternationNode} instances that subsequently have one or no children. In the case of one child, the child will replace the
+ * {@link AlternationNode}.
+ * - Remove all {@link ExpressionNode} instances that subsequently have an {@link ExpressionNode} child.
+ *
+ * See the following examples:
+ *
+ * - Input {@code "3||4||5"} will return {@code "3|4|5"}
+ * - Input {@code "3|()"} will return {@code "3"}
+ * - Input {@code "()|()"} will return {@code null}
+ *
+ */
+public class EmptyLeafTrimmer extends CopyVisitor {
+
+ /**
+ * Return a copy of the given tree trimmed of empty nodes. If the entire tree is trimmed, null will be returned, otherwise a {@link ExpressionNode} with the
+ * trimmed tree will be returned.
+ *
+ * @param node
+ * the node to trim
+ * @return the trimmed node
+ */
+ public static Node trim(Node node) {
+ if (node == null) {
+ return null;
+ }
+ EmptyLeafTrimmer visitor = new EmptyLeafTrimmer();
+ return (Node) node.accept(visitor, null);
+ }
+
+ @Override
+ public Object visitExpression(ExpressionNode node, Object data) {
+ Node copy = (Node) super.visitExpression(node, data);
+ if (copy.isLeaf()) {
+ return null;
+ } else if (copy.getChildCount() == 1) {
+ Node child = copy.getFirstChild();
+ if (child instanceof ExpressionNode) {
+ return child;
+ }
+ }
+ return copy;
+ }
+
+ @Override
+ public Object visitAlternation(AlternationNode node, Object data) {
+ Node copy = (Node) super.visitAlternation(node, data);
+ if (copy.isLeaf()) {
+ return null;
+ } else if (copy.getChildCount() == 1) {
+ return copy.getFirstChild();
+ } else {
+ return copy;
+ }
+ }
+
+ @Override
+ public Object visitGroup(GroupNode node, Object data) {
+ Node copy = (Node) super.visitGroup(node, data);
+ return copy.isLeaf() ? null : copy;
+ }
+
+ @Override
+ public Object visitEmpty(EmptyNode node, Object data) {
+ return null;
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/EqualityVisitor.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/EqualityVisitor.java
new file mode 100644
index 00000000000..b3291346593
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/EqualityVisitor.java
@@ -0,0 +1,169 @@
+package datawave.data.normalizer.regex.visitor;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.AnyCharNode;
+import datawave.data.normalizer.regex.CharClassNode;
+import datawave.data.normalizer.regex.CharRangeNode;
+import datawave.data.normalizer.regex.DigitCharClassNode;
+import datawave.data.normalizer.regex.EmptyNode;
+import datawave.data.normalizer.regex.EncodedNumberNode;
+import datawave.data.normalizer.regex.EncodedPatternNode;
+import datawave.data.normalizer.regex.EndAnchorNode;
+import datawave.data.normalizer.regex.EscapedSingleCharNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.GroupNode;
+import datawave.data.normalizer.regex.IntegerNode;
+import datawave.data.normalizer.regex.IntegerRangeNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.OneOrMoreNode;
+import datawave.data.normalizer.regex.QuestionMarkNode;
+import datawave.data.normalizer.regex.RepetitionNode;
+import datawave.data.normalizer.regex.SingleCharNode;
+import datawave.data.normalizer.regex.StartAnchorNode;
+import datawave.data.normalizer.regex.ZeroOrMoreNode;
+
+/**
+ * A {@link Visitor} implementation that will compare two {@link Node} tree and determine if they are equal.
+ */
+public class EqualityVisitor implements Visitor {
+
+ /**
+ * Return whether the given {@link Node} trees are equal.
+ *
+ * @param left
+ * the left tree to compare
+ * @param right
+ * the right tree to compare
+ * @return true if the trees are equal, or false otherwise.
+ */
+ public static boolean isEqual(Node left, Node right) {
+ if (left != null && right != null) {
+ EqualityVisitor visitor = new EqualityVisitor();
+ return (boolean) left.accept(visitor, right);
+ } else {
+ return left == null && right == null;
+ }
+ }
+
+ private boolean isEqual(Node left, Object data) {
+ Node right = (Node) data;
+ // Compare the nodes.
+ if (!left.equals(right)) {
+ return false;
+ }
+ // Compare the child counts.
+ if (left.getChildCount() != right.getChildCount()) {
+ return false;
+ }
+ // Compare the children.
+ for (int index = 0; index < left.getChildCount(); index++) {
+ Node leftChild = left.getChildAt(index);
+ Node rightChild = right.getChildAt(index);
+ boolean isEqual = (boolean) leftChild.accept(this, rightChild);
+ if (!isEqual) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ @Override
+ public Object visitExpression(ExpressionNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitAlternation(AlternationNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitGroup(GroupNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitDigitChar(DigitCharClassNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitCharClass(CharClassNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitCharRange(CharRangeNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitSingleChar(SingleCharNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitEscapedSingleChar(EscapedSingleCharNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitRepetition(RepetitionNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitQuestionMark(QuestionMarkNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitAnyChar(AnyCharNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitZeroToMany(ZeroOrMoreNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitOneToMany(OneOrMoreNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitInteger(IntegerNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitIntegerRange(IntegerRangeNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitEmpty(EmptyNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitStartAnchor(StartAnchorNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitEndAnchor(EndAnchorNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitEncodedNumber(EncodedNumberNode node, Object data) {
+ return isEqual(node, data);
+ }
+
+ @Override
+ public Object visitEncodedPattern(EncodedPatternNode node, Object data) {
+ return isEqual(node, data);
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ExponentialBinAdder.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ExponentialBinAdder.java
new file mode 100644
index 00000000000..0884988f23e
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ExponentialBinAdder.java
@@ -0,0 +1,154 @@
+package datawave.data.normalizer.regex.visitor;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Function;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import datawave.data.normalizer.regex.CharClassNode;
+import datawave.data.normalizer.regex.CharRangeNode;
+import datawave.data.normalizer.regex.EncodedPatternNode;
+import datawave.data.normalizer.regex.EscapedSingleCharNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.RegexConstants;
+import datawave.data.normalizer.regex.RegexUtils;
+import datawave.data.normalizer.regex.SingleCharNode;
+import datawave.data.type.util.NumericalEncoder;
+
+/**
+ * Implementation of {@link CopyVisitor} that will return a copy of the tree where all non-simple number patterns are enriched with bin information.
+ */
+public class ExponentialBinAdder extends SubExpressionVisitor {
+
+ /**
+ * Return a copy of the given tree with all regex patterns enriched with exponential bin information.
+ *
+ * @param node
+ * the node
+ * @return the enriched node
+ */
+ public static Node addBins(Node node) {
+ if (node == null) {
+ return null;
+ }
+ ExponentialBinAdder visitor = new ExponentialBinAdder();
+ return (Node) node.accept(visitor, null);
+ }
+
+ // Retrieves bins for negative numbers.
+ private static final Function NEGATIVE_BIN_FUNCTION = NumericalEncoder::getNegativeBin;
+
+ // Retrieves bins for positive numbers.
+ private static final Function POSITIVE_BIN_FUNCTION = NumericalEncoder::getPositiveBin;
+
+ @Override
+ protected Object visitSubExpression(Node node) {
+ List binNodes = new ArrayList<>();
+ boolean negative = RegexUtils.isNegativeRegex(node);
+
+ // The bin information consist of:
+ // 1. The lead sign that indicates whether the range covers positive (\+) or negative numbers (!).
+ binNodes.add(getLeadSign(negative));
+ // 2. The range of exponential bin letters. This may either be a single bin letter, or a character class of multiple bin letters.
+ binNodes.add(getBinRange(node, negative));
+ // 3. An 'E' to separate the bin information from the beginning of the numeric regex pattern.
+ binNodes.add(new SingleCharNode(RegexConstants.CAPITAL_E));
+
+ // Return an EncodedPatternNode copy rather than an ExpressionNode.
+ EncodedPatternNode encodedPattern = new EncodedPatternNode(copy(node).getChildren());
+
+ // If we had a negative sign, remove it. We will have ! (negative) and \+ (positive) going forward.
+ if (negative) {
+ encodedPattern.removeFirstChild();
+ }
+
+ // Insert the bin information at the beginning of the pattern.
+ int insertIndex = 0;
+ for (Node binNode : binNodes) {
+ encodedPattern.addChild(binNode, insertIndex);
+ insertIndex++;
+ }
+ return encodedPattern;
+ }
+
+ /**
+ * Return {@code "\+"} if negative is false, or {@code "!"} if negative is true.
+ *
+ * @param negative
+ * whether the regex pattern matches against negative numbers.
+ * @return the lead sign
+ */
+ private Node getLeadSign(boolean negative) {
+ return negative ? new SingleCharNode(RegexConstants.EXCLAMATION_POINT) : new EscapedSingleCharNode(RegexConstants.PLUS);
+ }
+
+ /**
+ * Get the range of exponential bins that the regex pattern should cover.
+ *
+ * @param node
+ * the regex pattern
+ * @param negative
+ * whether the pattern matches against negative numbers
+ * @return the bin range, either a single bin letter or a character class of bin ranges
+ */
+ private Node getBinRange(Node node, boolean negative) {
+ // Determine what exponential bins should be included in the encoded expression.
+ // Get the bin range for numbers equal to or greater than one that the pattern can match against.
+ Pair gteOneBinRange = GTEOneBinFinder.binRangeOf(node);
+ // Get the bin range for numbers less than one that the pattern can match against.
+ Pair ltOneBinRange = LTOneBinFinder.binRangeOf(node);
+
+ // The target bin retrieval function depends on whether the pattern matches against negative numbers.
+ Function binFunction = negative ? NEGATIVE_BIN_FUNCTION : POSITIVE_BIN_FUNCTION;
+
+ if (gteOneBinRange == null) {
+ // If the regex pattern cannot match against numbers equal to or greater than one, return the bin info for numbers less than one only.
+ return buildBinFromSingleRange(ltOneBinRange, binFunction);
+ } else if (ltOneBinRange == null) {
+ // If the regex pattern cannot match against numbers less than one, return the bin info for numbers equal to or greater than one only.
+ return buildBinFromSingleRange(gteOneBinRange, binFunction);
+ } else {
+ // Otherwise, merge the bin ranges and return them.
+ CharClassNode charClass = new CharClassNode();
+ Node onePlusBin = buildBinFromSingleRange(gteOneBinRange, binFunction);
+ // If a single character was returned, add it to the character class. Otherwise, a character class with a range was returned. Add the range.
+ charClass.addChild(onePlusBin instanceof SingleCharNode ? onePlusBin : onePlusBin.getFirstChild());
+
+ Node subOneBin = buildBinFromSingleRange(ltOneBinRange, binFunction);
+ // If a single character was returned, add it to the character class. Otherwise, a character class with a range was returned. Add the range.
+ charClass.addChild(subOneBin instanceof SingleCharNode ? subOneBin : subOneBin.getFirstChild());
+ return charClass;
+ }
+ }
+
+ /**
+ * Return a bin info node for a single bin range.
+ *
+ * @param binRange
+ * the
+ * @param binFunction
+ * the delegate bin retrieval function
+ * @return the bin info
+ */
+ private Node buildBinFromSingleRange(Pair binRange, Function binFunction) {
+ if (binRange.getLeft().equals(binRange.getRight())) {
+ // We have a single bin to cover in this range. Return a single char node.
+ return new SingleCharNode(binFunction.apply(binRange.getLeft()));
+ } else {
+ // We have a range of bins to cover. Create a character class.
+ CharClassNode charClass = new CharClassNode();
+ char left = binFunction.apply(binRange.getLeft());
+ char right = binFunction.apply(binRange.getRight());
+ int compare = Character.compare(left, right);
+ // It's possible for the left sided-bin to be alphabetically higher than the right side. If so, flip them around in the character class range.
+ if (compare < 0) {
+ charClass.addChild(new CharRangeNode(left, right));
+ } else {
+ charClass.addChild(new CharRangeNode(right, left));
+ }
+ return charClass;
+ }
+ }
+
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/GTEOneBinFinder.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/GTEOneBinFinder.java
new file mode 100644
index 00000000000..1548890fecf
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/GTEOneBinFinder.java
@@ -0,0 +1,143 @@
+package datawave.data.normalizer.regex.visitor;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.NodeType;
+import datawave.data.normalizer.regex.RegexUtils;
+
+/**
+ * Implementation of {@link BinFinder} that finds the range of exponential bins that a regex pattern should match against for numbers equal to or greater than
+ * one.
+ */
+public class GTEOneBinFinder extends BinFinder {
+
+ private static final int MIN_BIN = 0;
+ private static final int MAX_BIN = 25;
+ private static final int INITIAL_ENDPOINT_VALUE = -1;
+
+ public static Pair binRangeOf(Node node) {
+ GTEOneBinFinder calculator = new GTEOneBinFinder(node);
+ return calculator.getBinRange();
+ }
+
+ protected GTEOneBinFinder(Node node) {
+ super(node, MIN_BIN, MAX_BIN, INITIAL_ENDPOINT_VALUE);
+ }
+
+ @Override
+ protected Pair getBinRange() {
+ calculateRange();
+ normalizeRange();
+ return getEndpoints();
+ }
+
+ /**
+ * Calculate the bin range.
+ */
+ private void calculateRange() {
+ // Skip any leading zero elements that only match a zero character.
+ childrenIter.seekPastZeroOnlyElements();
+
+ // If a decimal point is present, and we have reached it after skipping zero-only elements, there's nothing further to do.
+ if (childrenIter.index() == decimalPointIndex) {
+ return;
+ }
+
+ boolean lockedAtWildcard = false;
+ boolean nonLeadingZeroSeen = false;
+
+ // Iterate through the remaining children up to the decimal point (if present).
+ while (childrenIter.hasNext() && !(childrenIter.index() == decimalPointIndex)) {
+ Node next = childrenIter.next();
+ if (lockedAtWildcard) {
+ // If we have previously locked the lower bound at a wildcard, we do not need to make further evaluations on the current element. Update the
+ // bin range with it.
+ updateBinRange();
+ } else if (nonLeadingZeroSeen) {
+ // If the current node is a wildcard, and an explicit decimal point is not present in the regex, lock the lower bound. This will ensure we match
+ // against numbers that had a decimal point that would match against this wildcard.
+ if (decimalPointIndex == -1 && next.getType() == NodeType.ANY_CHAR) {
+ lockLower();
+ lockedAtWildcard = true;
+ }
+ // If any non-leading zero elements were seen, update the bin range with the current element. We must still check for a wildcard.
+ updateBinRange();
+ } else if (RegexUtils.matchesZeroOnly(next)) {
+ // The current element matches zero only, e.g. '0' or [0], and is part of a leading zero. Update the bin range with the current element.
+ updateBinRange();
+ } else if (RegexUtils.matchesZero(next)) {
+ // The current element can match zero and at least one other number. Reset the lower bound, and seek ahead to determine if we should lock the
+ // lower bound.
+ setLowerToInitialEndpointValue();
+ // If this leading zero is the last element that can match against any other number until the end of the regex, or until the decimal point, we
+ // must lock the lower bound here.
+ if (isRemainingZeroOnlyUntilEndOrDecimalPoint()) {
+ // The current element must occur at least once, so increment lower by one before locking it.
+ incrementLower();
+
+ // We want to update the bin range without modifying the lower bound, so lock the lower bound, update the bin range, and then unlock the
+ // lower bound. The lower bound must be unlocked afterwards to allow for any subsequent zero-only characters to be counted if seen.
+ lockLower();
+ updateBinRange();
+ unlockLower();
+ } else {
+ // Update the bin range.
+ updateBinRange();
+ }
+ } else {
+ // We've seen our first non-leading zero. Mark it so.
+ nonLeadingZeroSeen = true;
+ // Reset the lower bound before updating the bin range. Any elements we saw before this were leading zeros that can be disregarded.
+ setLowerToInitialEndpointValue();
+ updateBinRange();
+ }
+ }
+ }
+
+ /**
+ * Return whether, if skipping all elements that can only match zero, there are no more elements or the next element is a decimal point.
+ *
+ * @return true if the remaining regex pattern will match zero either until the end or a decimal point, or false otherwise
+ */
+ private boolean isRemainingZeroOnlyUntilEndOrDecimalPoint() {
+ // Make a note of the iterator's current index so that we can reset it later.
+ int originalIndex = childrenIter.index();
+
+ // Skip past any quantifiers or question marks the current element may have had.
+ childrenIter.seekPastQuantifiers();
+ childrenIter.seekPastQuestionMarks();
+
+ // Find the next node that does not only match the character '0'.
+ Node nextNonZeroOnlyNode = null;
+ while (childrenIter.hasNext()) {
+ Node next = childrenIter.next();
+
+ // If the current element does not match zero only, we've found our target node. Stop looping.
+ if (!RegexUtils.matchesZeroOnly(next)) {
+ nextNonZeroOnlyNode = next;
+ break;
+ }
+ childrenIter.seekPastQuantifiers();
+ childrenIter.seekPastQuestionMarks();
+ }
+ // Reset the iterator to the original index.
+ childrenIter.setIndex(originalIndex);
+
+ return nextNonZeroOnlyNode == null || RegexUtils.isDecimalPoint(nextNonZeroOnlyNode);
+ }
+
+ /**
+ * Update the bin range with the current element, taking into account any specified quantifiers.
+ */
+ private void updateBinRange() {
+ if (childrenIter.hasNext() && childrenIter.isNextQuantifier()) {
+ // If a quantifier was specified, increment the upper and lower bound based on the quantifier type.
+ updateRangeWithNextQuantifier();
+ } else {
+ // If no quantifier was specified, increment the upper and lower bound by one.
+ incrementUpper();
+ incrementLower();
+ }
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/LTOneBinFinder.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/LTOneBinFinder.java
new file mode 100644
index 00000000000..71496d08de4
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/LTOneBinFinder.java
@@ -0,0 +1,143 @@
+package datawave.data.normalizer.regex.visitor;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import datawave.data.normalizer.regex.AnyCharNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.RegexUtils;
+
+/**
+ * Implementation of {@link BinFinder} that finds the range of exponential bins that a regex pattern should match against for numbers less than one.
+ */
+public class LTOneBinFinder extends BinFinder {
+
+ private static final int MAX_BIN = 26;
+ private static final int MIN_BIN = 1;
+ private static final int INITIAL_ENDPOINT_VALUE = 0;
+
+ public static Pair binRangeOf(Node node) {
+ LTOneBinFinder calculator = new LTOneBinFinder(node);
+ return calculator.getBinRange();
+ }
+
+ protected LTOneBinFinder(Node node) {
+ super(node, MIN_BIN, MAX_BIN, INITIAL_ENDPOINT_VALUE);
+ }
+
+ @Override
+ protected Pair getBinRange() {
+ if (decimalPointIndex == -1) {
+ calculateRangeWithoutDecimalPoint();
+ } else {
+ calculateRangeWithDecimalPoint();
+ }
+ normalizeRange();
+
+ // When retrieving bins for numbers less than one, the bin values must be negative. Negate the endpoints.
+ lower = -lower;
+ upper = -upper;
+
+ return getEndpoints();
+ }
+
+ /**
+ * Calculate the bin range for a pattern that has no decimal point specified in it.
+ */
+ private void calculateRangeWithoutDecimalPoint() {
+ // Get the index of the first wildcard in the regex, if present.
+ int firstWildcardIndex = node.indexOf(AnyCharNode.class);
+
+ // If there is no wildcard present in the regex, the regex does not need a bin range for numbers less than one.
+ if (firstWildcardIndex == -1) {
+ return;
+ }
+
+ // If there are any elements before the wildcard, they must all be able to possibly be a leading zero up to the wildcard. If not, the pattern will not
+ // match against numbers less than one and does not need a bin range for numbers less than one.
+ while (childrenIter.index() != firstWildcardIndex) {
+ Node next = childrenIter.peekNext();
+ // We found an element that cannot match zero before the wildcard. Return early.
+ if (!RegexUtils.matchesZero(next)) {
+ return;
+ } else {
+ // We found an element that can match zero. Move the iterator forward, and skip any quantifiers or question marks.
+ childrenIter.next();
+ childrenIter.seekPastQuantifiers();
+ childrenIter.seekPastQuestionMarks();
+ }
+ }
+
+ // Skip over the first wildcard, capture any quantifier if present, and skip past any question marks.
+ childrenIter.next();
+ Node quantifier = childrenIter.isNextQuantifier() ? childrenIter.next() : null;
+ childrenIter.seekPastQuestionMarks();
+
+ // If there are no elements after the wildcard, and the wildcard did not have a quantifier, there is nothing more to do.
+ if (!childrenIter.hasNext() && quantifier == null) {
+ return;
+ }
+
+ // Otherwise we will at least have the minimum bin range possible.
+ incrementLower();
+ incrementUpper();
+
+ // If the first wildcard had a quantifier, lock the lower bound and update the upper bound based on the quantifier.
+ if (quantifier != null) {
+ lockLower();
+ updateRangeWithQuantifier(quantifier);
+ }
+
+ // Process the remaining children.
+ processRemainingChildren();
+ }
+
+ /**
+ * Calculate the bin range for a pattern with a decimal point in it.
+ */
+ private void calculateRangeWithDecimalPoint() {
+ // Seek past children that can match the character '0'. If the next child after this is not the decimal point, then the regex expression will not
+ // match against numbers less than one.
+ childrenIter.seekPastZeroMatchingElements();
+ if (childrenIter.index() != decimalPointIndex) {
+ return;
+ }
+
+ // Skip over the decimal point to the next character.
+ childrenIter.next();
+ // We will at least have the minimum bin range possible.
+ incrementUpper();
+ incrementLower();
+
+ // Process the remaining children.
+ processRemainingChildren();
+ }
+
+ /**
+ * Iterate over the remaining children in the children iterator and update the bin range.
+ */
+ private void processRemainingChildren() {
+ // For each possible leading zero after the decimal point, update the bin range.
+ while (childrenIter.hasNext()) {
+ Node next = childrenIter.next();
+ // If next can be a leading zero, update the range.
+ if (RegexUtils.matchesZero(next)) {
+ // If next can possible be not a zero, lock the lower bound.
+ if (!RegexUtils.matchesZeroOnly(next)) {
+ lockLower();
+ }
+
+ // If the element has a quantifier, increment the upper and lower bound based on the quantifier.
+ if (childrenIter.isNextQuantifier()) {
+ updateRangeWithNextQuantifier();
+ } else {
+ // Otherwise increment the upper and lower bound by one.
+ incrementLower();
+ incrementUpper();
+ }
+ } else {
+ // If next cannot possibly be a leading zero, there is nothing more to do.
+ return;
+ }
+ }
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NegativeNumberPatternInverter.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NegativeNumberPatternInverter.java
new file mode 100644
index 00000000000..3f1d7c0ff12
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NegativeNumberPatternInverter.java
@@ -0,0 +1,567 @@
+package datawave.data.normalizer.regex.visitor;
+
+import static datawave.data.normalizer.regex.RegexUtils.toChar;
+import static datawave.data.normalizer.regex.RegexUtils.toInt;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.CharClassNode;
+import datawave.data.normalizer.regex.CharRangeNode;
+import datawave.data.normalizer.regex.EncodedPatternNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.GroupNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.NodeListIterator;
+import datawave.data.normalizer.regex.NodeType;
+import datawave.data.normalizer.regex.OneOrMoreNode;
+import datawave.data.normalizer.regex.RegexConstants;
+import datawave.data.normalizer.regex.RegexUtils;
+import datawave.data.normalizer.regex.RepetitionNode;
+import datawave.data.normalizer.regex.SingleCharNode;
+import datawave.data.normalizer.regex.ZeroOrMoreNode;
+
+/**
+ * Implementation of {@link CopyVisitor} that will return a copy of a regex tree with all patterns that are meant to match negative numbers inverted such that
+ * they will match against negative numbers that were encoded by {@link datawave.data.type.util.NumericalEncoder}. The numerical encoder encodes negative
+ * numbers such that the mantissa equals ten minus the mantissa of scientific notation.
+ *
+ * @see datawave.data.type.util.NumericalEncoder
+ */
+public class NegativeNumberPatternInverter extends CopyVisitor {
+
+ private static final int TEN = 10;
+ private static final int NINE = 9;
+
+ /**
+  * Run a new {@link NegativeNumberPatternInverter} over the given tree and return the node it produces.
+  *
+  * @param node
+  *            the root of the tree, may be null
+  * @return the node returned by the visitor, or null if the given node is null
+  */
+ public static Node invert(Node node) {
+     // A null tree has nothing to visit; otherwise hand the tree to a fresh visitor with no extra data.
+     return node == null ? null : (Node) node.accept(new NegativeNumberPatternInverter(), null);
+ }
+
+ /**
+  * Return a copy of the given encoded pattern. If the copy's first child is the {@code !} character, every child after the first capital {@code E} is
+  * inverted via {@link PatternInverter}; otherwise the copy is returned as is.
+  *
+  * @param node
+  *            the encoded pattern node
+  * @param data
+  *            unused
+  * @return the copied, and possibly inverted, encoded pattern
+  */
+ @Override
+ public Object visitEncodedPattern(EncodedPatternNode node, Object data) {
+     // Operate on a copy of the pattern tree.
+     Node copy = copy(node);
+
+     // If the first character is not !, this is not a negative number pattern. Return the copy.
+     if (!RegexUtils.isChar(copy.getFirstChild(), RegexConstants.EXCLAMATION_POINT)) {
+         return copy;
+     }
+
+     // Create an initial encoded pattern node with all the leading bin info, i.e. every child up to and including the first 'E'.
+     EncodedPatternNode encodedPattern = new EncodedPatternNode();
+     List<Node> children = copy.getChildren();
+     int startOfNodesToInvert = 0;
+     for (Node child : children) {
+         startOfNodesToInvert++;
+         encodedPattern.addChild(child);
+         if (RegexUtils.isChar(child, RegexConstants.CAPITAL_E)) {
+             break;
+         }
+     }
+
+     // Invert the remaining nodes and add them to the encoded pattern node. A fresh list is passed because the inverter reverses its input in place.
+     List<Node> nodesToInvert = new ArrayList<>(children.subList(startOfNodesToInvert, children.size()));
+     encodedPattern.addChildren(new PatternInverter(nodesToInvert).invert());
+     return encodedPattern;
+ }
+
+ /**
+  * Inverts a list of pattern nodes by walking them from last to first. The trailing element(s) are inverted with a minuend of 10 and all earlier elements
+  * with a minuend of 9.
+  */
+ private static class PatternInverter {
+
+     // The node iterator. Iterates over the nodes in reverse of their original order.
+     protected final NodeListIterator iter;
+
+     // The currently inverted nodes, accumulated in reverse order until invert() flips them back.
+     protected final List<Node> inverted = new ArrayList<>();
+
+     // The most recent element.
+     protected Node currentElement;
+
+     // The most recent quantifier.
+     protected Node currentQuantifier;
+
+     // The most recent question mark.
+     protected Node currentQuestionMark;
+
+     /**
+      * Create an inverter over the given nodes.
+      *
+      * @param nodes
+      *            the nodes to invert; this list is reversed in place
+      */
+     public PatternInverter(List<Node> nodes) {
+         Collections.reverse(nodes);
+         this.iter = new NodeListIterator(nodes);
+     }
+
+     /**
+      * Invert all nodes and return them in their original left-to-right order.
+      *
+      * @return the inverted nodes
+      */
+     public List<Node> invert() {
+         invertEndingPermutations();
+         while (iter.hasNext()) {
+             captureNext();
+             inverted.addAll(subtractCurrentFromNine(false));
+         }
+         Collections.reverse(inverted);
+         return inverted;
+     }
+
+     /**
+      * Invert the trailing element(s) of the pattern, creating a group of alternative endings when trailing elements may not occur.
+      */
+     private void invertEndingPermutations() {
+         // Fetch the first element.
+         captureNext();
+
+         // If the first element can occur zero times, e.g. it could match the '0' character (which would not show up in an encoded number), or it has a
+         // quantifier that allows for zero occurrences, e.g. {0,4}, then we must identify all possible trailing elements that may not occur, and create
+         // ending permutations that allow for the possibility of each successive element not occurring. The last element of each permutation must be
+         // inverted with a minuend of 10, and any preceding elements must be inverted with a minuend of 9.
+         if (currentCanOccurZeroTimes()) {
+             List<List<Node>> permutations = new ArrayList<>();
+             // Add a permutation of the first element inverted with a minuend of 10.
+             permutations.add(subtractCurrentFromTen());
+             // Examine all remaining elements until we find one that must occur at least once.
+             while (iter.hasNext()) {
+                 captureNext();
+                 // Add a variant of the current element inverted with a minuend of 9 to all existing permutations.
+                 List<Node> subtractedFromNine = subtractCurrentFromNine(true);
+                 for (List<Node> permutation : permutations) {
+                     permutation.addAll(subtractedFromNine);
+                 }
+                 // If the current element does not match only the '0' character, add a new permutation with a variant of the current element inverted with a
+                 // minuend of 10.
+                 if (!currentMatchesZeroOnly()) {
+                     permutations.add(0, subtractCurrentFromTen());
+                 }
+                 if (!currentCanOccurZeroTimes()) {
+                     break;
+                 }
+             }
+             if (permutations.size() == 1) {
+                 // If we only have one permutation, the pattern was only one element long, e.g. "\d". Add the sole permutation to the inverted nodes list.
+                 inverted.addAll(permutations.get(0));
+             } else {
+                 // If we have multiple permutations, we need to create alternations of these permutations, and wrap them in a group.
+                 // No explicit sort is needed: each new permutation was inserted at index 0 while the existing ones kept growing, so the list is already
+                 // ordered from the most recently created permutation to the oldest one.
+                 AlternationNode alternation = new AlternationNode();
+                 for (List<Node> permutation : permutations) {
+                     // Reverse the nodes in the permutation to restore the correct order.
+                     Collections.reverse(permutation);
+                     // Add the permutation as an expression to the alternation node.
+                     alternation.addChild(new ExpressionNode(permutation));
+                 }
+                 // Wrap the alternation in a group before adding it to the inverted nodes list.
+                 inverted.add(new GroupNode(alternation));
+             }
+
+         } else {
+             // The last-most element must occur at least once, and cannot match the character '0'. Invert it with a minuend of 10, and add it to the
+             // inverted nodes list.
+             inverted.addAll(subtractCurrentFromTen());
+         }
+     }
+
+     /**
+      * Return whether the current element represents something that may match against a trailing zero, or may occur zero times.
+      *
+      * @return whether the current element could occur zero times in target matches
+      */
+     private boolean currentCanOccurZeroTimes() {
+         if (currentElement.getType() != NodeType.GROUP) {
+             return RegexUtils.matchesZero(currentElement) || (currentQuantifier != null && RegexUtils.canOccurZeroTimes(currentQuantifier));
+         } else {
+             // For a group, examine the group's first child and the quantifier that follows it inside the group, if any.
+             NodeListIterator groupIter = currentElement.getChildrenIterator();
+             Node targetElement = groupIter.next();
+             if (RegexUtils.matchesZero(targetElement)) {
+                 return true;
+             } else {
+                 if (groupIter.isNextQuantifier()) {
+                     return RegexUtils.canOccurZeroTimes(groupIter.next());
+                 }
+                 return false;
+             }
+         }
+     }
+
+     /**
+      * Return whether the current element, or the first child of the current element if it is a group, matches only the '0' character.
+      *
+      * @return true if only '0' can be matched, or false otherwise
+      */
+     private boolean currentMatchesZeroOnly() {
+         if (currentElement.getType() != NodeType.GROUP) {
+             return RegexUtils.matchesZeroOnly(currentElement);
+         } else {
+             return RegexUtils.matchesZeroOnly(currentElement.getFirstChild());
+         }
+     }
+
+     /**
+      * Return the current element inverted with a minuend of 10.
+      *
+      * @return the inverted nodes.
+      */
+     private List<Node> subtractCurrentFromTen() {
+         return ElementInverter.forType(currentElement).subtractFromTen(currentElement, currentQuantifier, currentQuestionMark, true);
+     }
+
+     /**
+      * Return the current element inverted with a minuend of 9.
+      *
+      * @param endingElement
+      *            whether the current element is an ending permutation element
+      * @return the inverted nodes
+      */
+     private List<Node> subtractCurrentFromNine(boolean endingElement) {
+         return ElementInverter.forType(currentElement).subtractFromNine(currentElement, currentQuantifier, currentQuestionMark, endingElement);
+     }
+
+     /**
+      * Capture the next element, quantifier, and current question mark. Because the nodes are iterated in reverse, a question mark and quantifier are
+      * encountered before the element they apply to.
+      */
+     protected void captureNext() {
+         // Reset the current elements to null.
+         setCurrentToNull();
+
+         // Extract the next element, quantifier, and question mark if present.
+         while (iter.hasNext()) {
+             if (iter.isNextQuestionMark()) {
+                 currentQuestionMark = iter.next();
+             } else if (iter.isNextQuantifier()) {
+                 currentQuantifier = iter.next();
+             } else {
+                 currentElement = iter.next();
+                 break;
+             }
+         }
+     }
+
+     /**
+      * Set the current element, quantifier, and question mark to null.
+      */
+     protected void setCurrentToNull() {
+         currentElement = null;
+         currentQuantifier = null;
+         currentQuestionMark = null;
+     }
+ }
+
+ /**
+  * Strategy for inverting a single pattern element, along with its quantifier and question mark, with a minuend of 9 or 10.
+  */
+ private interface ElementInverter {
+
+     ElementInverter NON_MODIFYING_INVERTER = new NonModifyingInverter();
+     ElementInverter SINGLE_CHAR_INVERTER = new SingleCharInverter();
+     ElementInverter CHAR_CLASS_INVERTER = new CharClassInverter();
+     ElementInverter GROUP_INVERTER = new GroupInverter();
+
+     /**
+      * Return the appropriate {@link ElementInverter} for the element's type.
+      *
+      * @param element
+      *            the element
+      * @return the inverter
+      * @throws IllegalArgumentException
+      *             if the element's type has no associated inverter
+      */
+     static ElementInverter forType(Node element) {
+         switch (element.getType()) {
+             case ESCAPED_SINGLE_CHAR:
+             case ANY_CHAR:
+             case DIGIT_CHAR_CLASS:
+                 return NON_MODIFYING_INVERTER;
+             case SINGLE_CHAR:
+                 return SINGLE_CHAR_INVERTER;
+             case CHAR_CLASS:
+                 return CHAR_CLASS_INVERTER;
+             case GROUP:
+                 return GROUP_INVERTER;
+             default:
+                 throw new IllegalArgumentException("Unhandled element type " + element.getType());
+         }
+     }
+
+     /**
+      * Return the given element inverted with a minuend of 9.
+      *
+      * @param element
+      *            the element
+      * @param quantifier
+      *            the element's quantifier, possibly null
+      * @param questionMark
+      *            the element's question mark, possibly null
+      * @param endingElement
+      *            whether the element is an ending permutation element
+      * @return the inverted nodes
+      */
+     List<Node> subtractFromNine(Node element, Node quantifier, Node questionMark, boolean endingElement);
+
+     /**
+      * Return the given element inverted with a minuend of 10.
+      *
+      * @param element
+      *            the element
+      * @param quantifier
+      *            the element's quantifier, possibly null
+      * @param questionMark
+      *            the element's question mark, possibly null
+      * @param endingElement
+      *            whether the element is an ending permutation element
+      * @return the inverted nodes
+      */
+     List<Node> subtractFromTen(Node element, Node quantifier, Node questionMark, boolean endingElement);
+ }
+
+ /**
+  * Abstract implementation of {@link ElementInverter} with some shared functionality.
+  */
+ private static abstract class AbstractInverter implements ElementInverter {
+
+     /**
+      * Return a new mutable list of the given nodes in the given order, with null entries left out.
+      *
+      * @param nodes
+      *            the nodes, any of which may be null
+      * @return the list of non-null nodes
+      */
+     protected List<Node> asList(Node... nodes) {
+         List<Node> list = new ArrayList<>();
+         for (Node node : nodes) {
+             if (node != null) {
+                 list.add(node);
+             }
+         }
+         return list;
+     }
+
+     /**
+      * Subtract the digit held by the given node from the minuend.
+      *
+      * @param node
+      *            the single character node
+      * @param minuend
+      *            the value to subtract the digit from
+      * @return a new node for the difference, or null if the difference is 10 or greater (e.g. 10 - 0) and so cannot be written as a single digit
+      */
+     protected SingleCharNode subtractSingleCharFrom(SingleCharNode node, int minuend) {
+         char digit = node.getCharacter();
+         int value = minuend - RegexUtils.toInt(digit);
+         return value < 10 ? new SingleCharNode(RegexUtils.toChar(value)) : null;
+     }
+
+ }
+
+ /**
+  * Handles elements that do not need to go through inversion, like wildcards or the digit character class {@code \d}. The element itself is always returned
+  * unchanged; only its quantifier or question mark may be adjusted.
+  */
+ private static class NonModifyingInverter extends AbstractInverter {
+
+     /**
+      * Return the element with its question mark and quantifier, in reverse order.
+      */
+     @Override
+     public List<Node> subtractFromNine(Node element, Node quantifier, Node questionMark, boolean endingElement) {
+         // If this is an ending permutation element, and the element is marked optional, make it non-optional.
+         if (endingElement && quantifier == null && questionMark != null) {
+             return asList(element);
+         }
+         // Return the elements in reverse order.
+         return asList(questionMark, quantifier, element);
+     }
+
+     /**
+      * Return the element with its question mark and quantifier, in reverse order, replacing a {@code *} quantifier with {@code +}.
+      */
+     @Override
+     public List<Node> subtractFromTen(Node element, Node quantifier, Node questionMark, boolean endingElement) {
+         // If this is an ending permutation element, and the element is marked optional, make it non-optional.
+         if (endingElement && quantifier == null && questionMark != null) {
+             return asList(element);
+         }
+         // If the quantifier is *, change it to + to require at least one occurrence.
+         if (quantifier != null && quantifier.getType() == NodeType.ZERO_OR_MORE) {
+             quantifier = new OneOrMoreNode();
+         }
+         // Return the elements in reverse order.
+         return asList(questionMark, quantifier, element);
+     }
+
+ }
+
+ /**
+  * Handles inverting single characters.
+  */
+ private static class SingleCharInverter extends AbstractInverter {
+
+     /**
+      * Return the given element inverted with a minuend of nine.
+      */
+     @Override
+     public List<Node> subtractFromNine(Node element, Node quantifier, Node questionMark, boolean endingElement) {
+         // Subtract the given digit char from 9.
+         Node newElement = subtractSingleCharFrom((SingleCharNode) element, NINE);
+         // If this is an ending permutation element, and the element is marked optional, make it non-optional.
+         if (endingElement && quantifier == null && questionMark != null) {
+             return asList(newElement);
+         }
+         // Return the elements in reverse order.
+         return asList(questionMark, quantifier, newElement);
+     }
+
+     /**
+      * Return the given char inverted with a minuend of ten.
+      *
+      * @throws IllegalArgumentException
+      *             if the quantifier is of a type that is not handled
+      */
+     @Override
+     public List<Node> subtractFromTen(Node element, Node quantifier, Node questionMark, boolean endingElement) {
+         // May be null when the difference is not a single digit; asList drops null entries.
+         Node fromTen = subtractSingleCharFrom((SingleCharNode) element, TEN);
+         // If the element does not have a quantifier, return the question mark and element in reverse order.
+         if (quantifier == null) {
+             // If this is an ending permutation element, and the element is marked optional, make it non-optional.
+             if (endingElement && questionMark != null) {
+                 return asList(fromTen);
+             } else {
+                 return asList(questionMark, fromTen);
+             }
+         } else {
+             // If the element has a quantifier, we must precede the version of the element subtracted from 10 with a version of the element subtracted from
+             // 9, and followed by the quantifier with one fewer occurrence.
+             Node fromNine = subtractSingleCharFrom((SingleCharNode) element, NINE);
+             switch (quantifier.getType()) {
+                 case ZERO_OR_MORE:
+                 case ONE_OR_MORE:
+                     // The new quantifier should be *. Return the elements in reverse order.
+                     return asList(fromTen, questionMark, new ZeroOrMoreNode(), fromNine);
+                 case REPETITION:
+                     // Get the repetition as a range, and subtract 1 from it.
+                     Pair<Integer,Integer> range = RegexUtils.getRepetitionAsRange((RepetitionNode) quantifier);
+                     range = RegexUtils.subtractOneFrom(range);
+                     // The null check on the right endpoint must come first so that the comparisons below never unbox a null.
+                     if (range.getRight() == null) {
+                         // The new range is {x,}. Create a new repetition from the range and use that.
+                         RepetitionNode fromNineQuantifier = RegexUtils.createRepetition(range);
+                         return asList(fromTen, questionMark, fromNineQuantifier, fromNine);
+                     } else if (range.getLeft() == 0 && range.getRight() == 0) {
+                         // The new range is {0,0}, so zero occurrences. Do not include a version of the element subtracted from 9.
+                         return asList(fromTen);
+                     } else if (range.getLeft() == 1 && range.getRight() == 1) {
+                         // The new range is {1,1}, exactly one occurrence. Include a version of the element subtracted from 9, but do not include a
+                         // quantifier.
+                         return asList(fromTen, fromNine);
+                     } else {
+                         // The new range is {x,y}. Create a new repetition from the range and use that.
+                         RepetitionNode fromNineQuantifier = RegexUtils.createRepetition(range);
+                         return asList(fromTen, questionMark, fromNineQuantifier, fromNine);
+                     }
+                 default:
+                     throw new IllegalArgumentException("Unhandled quantifier type " + quantifier.getType());
+             }
+         }
+     }
+ }
+
+ /**
+ * Handles inverting character classes by subtracting each single char and char range in the class from a minuend of nine or ten.
+ */
+ private static class CharClassInverter extends AbstractInverter {
+
+ @Override
+ public List subtractFromNine(Node element, Node quantifier, Node questionMark, boolean endingElement) {
+ // Subtract each element in the character class from 9 and return the elements in reverse order.
+ Node newElement = subtractFrom((CharClassNode) element, NINE);
+ // If this is an ending permutation element without a quantifier, and the element is marked optional, make it non-optional.
+ if (endingElement && quantifier == null && questionMark != null) {
+ return asList(newElement);
+ }
+ return asList(questionMark, quantifier, newElement);
+ }
+
+ @Override
+ public List subtractFromTen(Node element, Node quantifier, Node questionMark, boolean endingElement) {
+ Node fromTen = subtractFrom((CharClassNode) element, TEN);
+ // If the element does not have a quantifier, return the question mark and element in reverse order.
+ if (quantifier == null) {
+ // If this is an ending permutation element, and the element is marked optional, make it non-optional by dropping the question mark.
+ if (endingElement && questionMark != null) {
+ return asList(fromTen);
+ }
+ return asList(questionMark, fromTen);
+ } else {
+ // If the element has a quantifier, the version of the element subtracted from 10 must be preceded by a version of the element subtracted from 9
+ // that carries the quantifier with one fewer occurrence. The lists below are in reverse order, so the subtracted-from-10 version is listed first.
+ Node fromNine = subtractFrom((CharClassNode) element, NINE);
+ switch (quantifier.getType()) {
+ case ZERO_OR_MORE:
+ case ONE_OR_MORE:
+ // For both * and +, the new quantifier is *. Return the elements in reverse order.
+ return asList(fromTen, questionMark, new ZeroOrMoreNode(), fromNine);
+ case REPETITION:
+ // Get the repetition as a range, and subtract 1 from it.
+ Pair range = RegexUtils.getRepetitionAsRange((RepetitionNode) quantifier);
+ range = RegexUtils.subtractOneFrom(range);
+ if (range.getRight() == null) {
+ // The new range is {x,}. Create a new repetition from the range and use that.
+ RepetitionNode fromNineQuantifier = RegexUtils.createRepetition(range);
+ return asList(fromTen, questionMark, fromNineQuantifier, fromNine);
+ } else if (range.getLeft() == 0 && range.getRight() == 0) {
+ // The new range is {0,0}, so zero occurrences. Return only the version subtracted from 10; the question mark is not retained either.
+ return asList(fromTen);
+ } else if (range.getLeft() == 1 && range.getRight() == 1) {
+ // The new range is {1,1}, exactly one occurrence. Include a version of the element subtracted from 9, but do not include a
+ // quantifier. The question mark is not retained in this case either.
+ return asList(fromTen, fromNine);
+ } else {
+ // The new range is {x,y}. Create a new repetition from the range and use that.
+ RepetitionNode fromNineQuantifier = RegexUtils.createRepetition(range);
+ return asList(fromTen, questionMark, fromNineQuantifier, fromNine);
+ }
+ default:
+ throw new IllegalArgumentException("Unhandled quantifier type " + quantifier.getType());
+ }
+ }
+ }
+
+ private Node subtractFrom(CharClassNode node, int minuend) {
+ List children = new ArrayList<>();
+ for (Node child : node.getChildren()) {
+ // The child is a single char.
+ if (child instanceof SingleCharNode) {
+ // Invert the child as long as we are not trying to subtract 0 from 10. Otherwise, do not retain the child.
+ if (minuend != TEN || !RegexUtils.isChar(child, RegexConstants.ZERO)) {
+ children.add(subtractSingleCharFrom((SingleCharNode) child, minuend));
+ }
+ } else {
+ // The child is not a single char, so it is treated as a range.
+ CharRangeNode range = (CharRangeNode) child;
+ int rangeStart = toInt(range.getStart());
+ // If the current minuend is 10 and the start of the range is 0, adjust the range to start from 1 instead so that we're not subtracting 0
+ // from 10.
+ if (minuend == TEN && rangeStart == 0) {
+ rangeStart = 1;
+ }
+ int startValue = minuend - rangeStart;
+ int endValue = minuend - toInt(range.getEnd());
+ // Emit the new range with the smaller value first: (start-end) if the start value is less than or equal to the end value, else (end-start).
+ if (startValue <= endValue) {
+ children.add(new CharRangeNode(toChar(startValue), toChar(endValue)));
+ } else {
+ children.add(new CharRangeNode(toChar(endValue), toChar(startValue)));
+ }
+ }
+ }
+ // If after inverting the character class, we only have a single character in it, and the character class is not negated, return the single
+ // character rather than a character class.
+ if (children.size() == 1 && children.get(0).getType() == NodeType.SINGLE_CHAR && !node.isNegated()) {
+ return children.get(0);
+ } else {
+ // Otherwise, return a character class. Make a shallow copy in order to also copy over whether the char class is negated.
+ CharClassNode charClass = node.shallowCopy();
+ charClass.addChildren(children);
+ return charClass;
+ }
+ }
+ }
+
+ /**
+ * Handles inverting groups that were inserted into the pattern by {@link ZeroTrimmer}.
+ */
+ private static class GroupInverter extends AbstractInverter {
+
+ @Override
+ public List subtractFromNine(Node element, Node quantifier, Node questionMark, boolean endingElement) {
+ List children = invertGroup(element, NINE, endingElement);
+ // If this is an ending permutation element, return the group flattened.
+ if (endingElement) {
+ return children;
+ } else {
+ // Otherwise return a new group.
+ return createGroup(children, quantifier, questionMark);
+ }
+ }
+
+ @Override
+ public List subtractFromTen(Node element, Node quantifier, Node questionMark, boolean endingElement) {
+ List children = invertGroup(element, TEN, endingElement);
+ // If this is an ending permutation element, return the group flattened.
+ if (endingElement) {
+ return children;
+ } else {
+ // Otherwise return a new group.
+ return createGroup(children, quantifier, questionMark);
+ }
+ }
+
+ // Return the inverted form of the given group's element, quantifier, and question mark, using the inverter for the element's type.
+ private List invertGroup(Node group, int minuend, boolean endingElement) {
+ // Any group seen here was created by the ZeroTrimmer visitor, and will have at most one element, one quantifier, and one question mark. Fetch them
+ // from the group.
+ NodeListIterator iter = group.getChildrenIterator();
+ Node element = iter.next();
+ Node quantifier = iter.hasNext() && iter.isNextQuantifier() ? iter.next() : null;
+ Node questionMark = iter.hasNext() && iter.isNextQuestionMark() ? iter.next() : null;
+
+ // Fetch the appropriate inverter for the element type.
+ ElementInverter inverter = ElementInverter.forType(element);
+
+ // Invert the elements based on the minuend.
+ List inverted;
+ switch (minuend) {
+ case NINE:
+ inverted = inverter.subtractFromNine(element, quantifier, questionMark, endingElement);
+ break;
+ case TEN:
+ inverted = inverter.subtractFromTen(element, quantifier, questionMark, endingElement);
+ break;
+ default:
+ throw new IllegalArgumentException("Invalid minuend " + minuend);
+ }
+
+ // Return the inverted nodes as a flat list. Callers wrap them in a new group when one is required.
+ return inverted;
+ }
+
+ private List createGroup(List children, Node quantifier, Node questionMark) {
+ Collections.reverse(children); // NOTE: this reverses the list passed in by the caller in place.
+ return asList(questionMark, quantifier, new GroupNode(children)); // Question mark and quantifier precede the group, i.e. reverse order.
+ }
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NegativeVariantExpander.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NegativeVariantExpander.java
new file mode 100644
index 00000000000..b8e9c4e440e
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NegativeVariantExpander.java
@@ -0,0 +1,64 @@
+package datawave.data.normalizer.regex.visitor;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.AnyCharNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.RegexConstants;
+import datawave.data.normalizer.regex.SingleCharNode;
+
+/**
+ * Implementation of {@link CopyVisitor} that expands all regex expressions with negative variants of sub-expressions where applicable, particularly in the case
+ * of a complete expression with a leading wildcard. See the following examples:
+ *
+ * - Input {@code ".4.*"} will return {@code ".4.*|-.4.*"}
+ * - Input {@code ".*4"} will return {@code ".*4|-.*4"}
+ * - Input {@code ".*?4"} will return {@code ".*?4|-.*?4"}
+ * - Input {@code ".+4"} will return {@code ".+4|-.+4"}
+ * - Input {@code ".+?4"} will return {@code ".+?4|-.+?4"}
+ *
+ * Regexes with leading wildcards that have a negative sign in front of them will not require any expansion.
+ */
+public class NegativeVariantExpander extends SubExpressionVisitor {
+
+ public static Node expand(Node node) {
+ if (node == null) {
+ return null;
+ }
+ NegativeVariantExpander visitor = new NegativeVariantExpander();
+ return (Node) node.accept(visitor, null);
+ }
+
+ @Override
+ protected Object visitSubExpression(Node node) {
+ if (node.getFirstChild() instanceof AnyCharNode) {
+ return expandLeadingWildcard(node);
+ } else {
+ return copy(node);
+ }
+ }
+
+ /**
+ * Return an expression that contains a copy of the original expression, as well as a negative variant of it, as the two branches of an alternation.
+ *
+ * @param node
+ * the expression to expand
+ * @return the expanded expression
+ */
+ private Node expandLeadingWildcard(Node node) {
+ // Create a copy of the original expression to turn into the negative variant.
+ Node negativeCopy = copy(node);
+
+ // Insert a negative sign as the first child, directly before the wildcard character.
+ SingleCharNode negativeSign = new SingleCharNode(RegexConstants.HYPHEN);
+ negativeCopy.addChild(negativeSign, 0);
+
+ // Create an alternation node with a copy of the original expression and the negative copy as its children.
+ AlternationNode alternation = new AlternationNode();
+ alternation.addChild(copy(node));
+ alternation.addChild(negativeCopy);
+
+ // Return the alternation as the child of a new expression node.
+ return new ExpressionNode(alternation);
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NonEncodedNumbersChecker.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NonEncodedNumbersChecker.java
new file mode 100644
index 00000000000..4b6d029b95f
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NonEncodedNumbersChecker.java
@@ -0,0 +1,68 @@
+package datawave.data.normalizer.regex.visitor;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.EncodedNumberNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.Node;
+
+/**
+ * An implementation of {@link BaseVisitor} that will examine a node tree and return whether any non-encoded patterns remain in the tree. This is intended to be
+ * used in conjunction with {@link SimpleNumberEncoder} to see if any further work remains to be done after encoding any and all simple numbers in the tree via
+ * {@link SimpleNumberEncoder#encode(Node)}.
+ *
+ * @see SimpleNumberEncoder
+ */
+public class NonEncodedNumbersChecker extends BaseVisitor {
+
+ /**
+ * Check if there are any non-encoded number patterns still present in the tree.
+ *
+ * @param node
+ * the node to check; must not be null, since it is dereferenced without a null check
+ * @return true if there are any non-encoded patterns, or false otherwise.
+ */
+ public static boolean check(Node node) {
+ NonEncodedNumbersChecker visitor = new NonEncodedNumbersChecker();
+ node.accept(visitor, null);
+ return visitor.hasUnencodedPatterns;
+ }
+
+ private boolean hasUnencodedPatterns = false;
+
+ @Override
+ public Object visitExpression(ExpressionNode node, Object data) {
+ // If we have not yet found any unencoded patterns, check the node.
+ if (!this.hasUnencodedPatterns) {
+ // If we have an alternation, examine the alternation.
+ if (node.getFirstChild() instanceof AlternationNode) {
+ return super.visitExpression(node, data);
+ } else {
+ // Otherwise, the expression counts as unencoded unless its first child is an encoded number.
+ this.hasUnencodedPatterns = !(node.getFirstChild() instanceof EncodedNumberNode);
+ }
+ }
+ return null;
+ }
+
+ @Override
+ public Object visitAlternation(AlternationNode node, Object data) {
+ // If we have not yet found any unencoded patterns, check each child.
+ if (!this.hasUnencodedPatterns) {
+ for (Node child : node.getChildren()) {
+ child.accept(this, data);
+ // If we found a child with an unencoded pattern, stop checking the remaining children.
+ if (this.hasUnencodedPatterns) {
+ break;
+ }
+ }
+
+ }
+ return null;
+ }
+
+ @Override
+ public Object visitEncodedNumber(EncodedNumberNode node, Object data) {
+ // No need to traverse down into the children.
+ return null;
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NumericCharClassValidator.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NumericCharClassValidator.java
new file mode 100644
index 00000000000..e2c98cfb3b7
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/NumericCharClassValidator.java
@@ -0,0 +1,61 @@
+package datawave.data.normalizer.regex.visitor;
+
+import datawave.data.normalizer.regex.CharClassNode;
+import datawave.data.normalizer.regex.CharRangeNode;
+import datawave.data.normalizer.regex.EscapedSingleCharNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.RegexConstants;
+import datawave.data.normalizer.regex.SingleCharNode;
+
+/**
+ * Implementation of {@link BaseVisitor} that accepts a {@link Node} tree and verifies that any {@link CharClassNode} instances in it only have the following
+ * children:
+ *
+ * - A {@link SingleCharNode} that has a digit.
+ * - A {@link CharRangeNode} that has a digit start and a digit end.
+ *
+ */
+public class NumericCharClassValidator extends BaseVisitor {
+
+ private static final String ERROR_MESSAGE = "Character classes may only contain numeric characters and numeric ranges.";
+
+ public static void validate(Node node) {
+ if (node != null) {
+ NumericCharClassValidator visitor = new NumericCharClassValidator();
+ node.accept(visitor, null);
+ }
+ }
+
+ @Override
+ public Object visitCharClass(CharClassNode node, Object data) {
+ for (Node child : node.getChildren()) {
+ if (child instanceof EscapedSingleCharNode) {
+ // Do not allow any escaped characters.
+ throw new IllegalArgumentException(ERROR_MESSAGE);
+ } else if (child instanceof SingleCharNode) {
+ // Verify the character is contained in RegexConstants.ALL_DIGITS.
+ validate((SingleCharNode) child);
+ } else if (child instanceof CharRangeNode) {
+ // Verify both the start and the end of the range are contained in RegexConstants.ALL_DIGITS.
+ validate((CharRangeNode) child);
+ }
+ }
+ return null;
+ }
+
+ private void validate(SingleCharNode node) {
+ if (!RegexConstants.ALL_DIGITS.contains(node.getCharacter())) {
+ throw new IllegalArgumentException(ERROR_MESSAGE);
+ }
+ }
+
+ private void validate(CharRangeNode node) {
+ if (!RegexConstants.ALL_DIGITS.contains(node.getStart())) {
+ throw new IllegalArgumentException(ERROR_MESSAGE);
+ }
+ if (!RegexConstants.ALL_DIGITS.contains(node.getEnd())) {
+ throw new IllegalArgumentException(ERROR_MESSAGE);
+ }
+ }
+
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/OptionalVariantExpander.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/OptionalVariantExpander.java
new file mode 100644
index 00000000000..713d8c64826
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/OptionalVariantExpander.java
@@ -0,0 +1,167 @@
+package datawave.data.normalizer.regex.visitor;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.OneOrMoreNode;
+import datawave.data.normalizer.regex.QuestionMarkNode;
+import datawave.data.normalizer.regex.RegexUtils;
+import datawave.data.normalizer.regex.RepetitionNode;
+import datawave.data.normalizer.regex.ZeroOrMoreNode;
+
+/**
+ * Implementation of {@link CopyVisitor} that will return a copy of the tree where elements marked as optional are expanded such that each optional character results
+ * in an alternation variant with the optional character present, and an alternation variant with it absent. This does not apply to optionals found after a star,
+ * plus, or repetition quantifier, or any optionals applying to a character that occur after an escaped decimal point; e.g. in the cases of {@code ".*?111"},
+ * {@code ".+?111"}, {@code "14{3}?1"}, or {@code "12\.4?"}. See the following examples of cases where an optional will result in variants.
+ *
+ * - Input {@code "2?"} will return {@code "2"}
+ * - Input {@code "2.?5"} will return {@code "25|2.5"}
+ * - Input {@code "2[3-9]?5"} will return {@code "25|2[3-9]5"}
+ * - Input {@code "27?5"} will return {@code "25|275"}
+ * - Input {@code "2(45.*)?5"} will return {@code "25|2(45.*)5"}
+ * - Input {@code "2\.?5"} will return {@code "25|2\.5"}
+ * - Input {@code "-?25"} will return {@code "25|-25"}
+ *
+ */
+public class OptionalVariantExpander extends SubExpressionVisitor {
+
+ public static Node expand(Node node) {
+ if (node == null) {
+ return null;
+ }
+ OptionalVariantExpander visitor = new OptionalVariantExpander();
+ return (Node) node.accept(visitor, null);
+ }
+
+ @Override
+ protected Object visitSubExpression(Node node) {
+ if (node.isAnyChildOf(QuestionMarkNode.class)) {
+ return expandOptionals(node);
+ } else {
+ return copy(node);
+ }
+ }
+
+ /**
+ * Return an expression that contains the expanded variants of each expanded optional.
+ *
+ * @param node
+ * the expression to expand
+ * @return the expanded expression
+ */
+ private Node expandOptionals(Node node) {
+ List expansions = new ArrayList<>();
+ expansions.add(new ExpressionNode());
+
+ int startIndex = 0;
+ int optionalPos = node.indexOf(QuestionMarkNode.class);
+ int posBeforeOptional = optionalPos - 1;
+ int decimalPoint = RegexUtils.getDecimalPointIndex(node);
+
+ // If the first optional found is after an escaped decimal point, there is no need to do any expansion. Return a copy of the node.
+ if (decimalPoint != -1 && decimalPoint < posBeforeOptional) {
+ return copy(node);
+ }
+
+ do {
+ // Children from the start index (inclusive) to the position before optional (not inclusive) can be added to each expansion.
+ expansions = addChildrenToExpansions(expansions, node, startIndex, posBeforeOptional);
+ // Move the start index to the position before the optional.
+ startIndex = posBeforeOptional;
+
+ // If the optional is not a modifier to make a quantifier match in lazy mode, add expansions for each variant.
+ Node childBeforeOptional = node.getChildAt(posBeforeOptional);
+ if (!(isOptionalLazyModifierFor(childBeforeOptional))) {
+ expansions = addOptionalElement(expansions, childBeforeOptional);
+ startIndex = optionalPos + 1;
+ }
+
+ // Determine the position of the next optional node, and the child before it.
+ optionalPos = node.indexOf(QuestionMarkNode.class, (optionalPos + 1));
+ posBeforeOptional = optionalPos - 1;
+
+ // If there is an escaped decimal point in the regex, and the next optional is for a character after it, there is no need to do any further
+ // expansion.
+ if (decimalPoint != -1 && decimalPoint < posBeforeOptional) {
+ break;
+ }
+ } while (optionalPos != -1);
+
+ // If we have any remaining children to copy to each expansion, do so.
+ if (startIndex < (node.getChildCount())) {
+ expansions = addChildrenToExpansions(expansions, node, startIndex, node.getChildCount());
+ }
+
+ // Remove any expansions that are leafs without children.
+ expansions = expansions.stream().filter((ex) -> !ex.isLeaf()).collect(Collectors.toList());
+
+ // If we only have one expression after expansion, return the expression.
+ if (expansions.size() == 1) {
+ return expansions.get(0);
+ } else {
+ // Otherwise return an expression containing each expansion as an alternation.
+ return new ExpressionNode(new AlternationNode(expansions));
+ }
+ }
+
+ /**
+ * Return whether the given node is a *, +, or a repetition quantifier, in which case a question mark following it is not expanded into variants.
+ *
+ * @param node
+ * the node
+ * @return true if the node is a *, +, or a repetition quantifier, or false otherwise.
+ */
+ private boolean isOptionalLazyModifierFor(Node node) {
+ return node instanceof ZeroOrMoreNode || node instanceof OneOrMoreNode || node instanceof RepetitionNode;
+ }
+
+ /**
+ * Add the children of the given node from the start index (inclusive) to the end index (not inclusive) to each expansion in the list.
+ *
+ * @param expansions
+ * the expansions
+ * @param node
+ * the node
+ * @param startIndex
+ * the start index of children to copy (inclusive)
+ * @param endIndex
+ * the end index of children to copy (not inclusive)
+ * @return a new list holding an updated copy of each expansion
+ */
+ private List addChildrenToExpansions(List expansions, Node node, int startIndex, int endIndex) {
+ List newExpansions = new ArrayList<>();
+ for (Node expansion : expansions) {
+ Node newExpansion = copy(expansion);
+ for (int index = startIndex; index < endIndex; index++) {
+ newExpansion.addChild(copy(node.getChildAt(index)));
+ }
+ newExpansions.add(newExpansion);
+ }
+ return newExpansions;
+ }
+
+ /**
+ * Add the given optional element to each expansion, preserving a copy of each original expansion. The returned list is twice as long as the given one.
+ *
+ * @param expansions
+ * the expansions
+ * @param optionalElement
+ * the optional element
+ * @return a new list holding, for each expansion, a copy without the element followed by a copy with it
+ */
+ private List addOptionalElement(List expansions, Node optionalElement) {
+ List newExpansions = new ArrayList<>();
+ for (Node expansion : expansions) {
+ newExpansions.add(copy(expansion));
+ Node newExpansion = copy(expansion);
+ newExpansion.addChild(copy(optionalElement));
+ newExpansions.add(newExpansion);
+ }
+ return newExpansions;
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/PrintVisitor.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/PrintVisitor.java
new file mode 100644
index 00000000000..d3d9825b1ed
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/PrintVisitor.java
@@ -0,0 +1,223 @@
+package datawave.data.normalizer.regex.visitor;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.AnyCharNode;
+import datawave.data.normalizer.regex.CharClassNode;
+import datawave.data.normalizer.regex.CharRangeNode;
+import datawave.data.normalizer.regex.DigitCharClassNode;
+import datawave.data.normalizer.regex.EmptyNode;
+import datawave.data.normalizer.regex.EncodedNumberNode;
+import datawave.data.normalizer.regex.EncodedPatternNode;
+import datawave.data.normalizer.regex.EndAnchorNode;
+import datawave.data.normalizer.regex.EscapedSingleCharNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.GroupNode;
+import datawave.data.normalizer.regex.IntegerNode;
+import datawave.data.normalizer.regex.IntegerRangeNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.OneOrMoreNode;
+import datawave.data.normalizer.regex.QuestionMarkNode;
+import datawave.data.normalizer.regex.RepetitionNode;
+import datawave.data.normalizer.regex.SingleCharNode;
+import datawave.data.normalizer.regex.StartAnchorNode;
+import datawave.data.normalizer.regex.ZeroOrMoreNode;
+
+/**
+ * A {@link Visitor} implementation that accepts a {@link Node} tree and writes a pretty-print of it either to {@link System#out} or to a string.
+ */
+public class PrintVisitor implements Visitor {
+
+ private static final String PREFIX = " ";
+
+ private interface Output {
+ void write(String line);
+ }
+
+ private static class SystemOutput implements Output {
+
+ @Override
+ public void write(String line) {
+ System.out.println(line);
+ }
+ }
+
+ private static class StringBuilderOutput implements Output {
+
+ private final StringBuilder sb = new StringBuilder();
+
+ @Override
+ public void write(String line) {
+ sb.append("\n").append(line); // Every line is preceded by a newline, so the accumulated text starts with one.
+ }
+ }
+
+ /**
+ * Streams a pretty-print of the given node to {@link System#out}. A null node is printed as the line {@code null}.
+ *
+ * @param node
+ * the node to print
+ */
+ public static void printToSysOut(Node node) {
+ if (node == null) {
+ System.out.println("null");
+ } else {
+ PrintVisitor visitor = new PrintVisitor(new SystemOutput());
+ node.accept(visitor, "");
+ }
+ }
+
+ /**
+ * Returns a string containing a pretty print of the given node.
+ *
+ * @param node
+ * the node
+ * @return the pretty-printed tree, or the string {@code "null"} if the node is null
+ */
+ public static String printToString(Node node) {
+ if (node == null) {
+ return "null";
+ } else {
+ StringBuilderOutput output = new StringBuilderOutput();
+ PrintVisitor visitor = new PrintVisitor(output);
+ node.accept(visitor, "");
+ return output.sb.toString();
+ }
+ }
+
+ private final Output output;
+
+ protected PrintVisitor(Output output) {
+ this.output = output;
+ }
+
+ private void print(Node node, Object data) {
+ printLine(node, data);
+ if (node != null) {
+ node.childrenAccept(this, (data + PREFIX)); // Children are printed with the current indent extended by PREFIX.
+ }
+ }
+
+ private void printLine(Node node, Object data) {
+ output.write(data + "" + node); // data is the indent accumulated so far; the node is rendered via string concatenation.
+ }
+
+ @Override
+ public Object visitExpression(ExpressionNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitAlternation(AlternationNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitGroup(GroupNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitDigitChar(DigitCharClassNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitCharClass(CharClassNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitCharRange(CharRangeNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitSingleChar(SingleCharNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitEscapedSingleChar(EscapedSingleCharNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitRepetition(RepetitionNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitQuestionMark(QuestionMarkNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitAnyChar(AnyCharNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitZeroToMany(ZeroOrMoreNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitOneToMany(OneOrMoreNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitInteger(IntegerNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitIntegerRange(IntegerRangeNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitEmpty(EmptyNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitStartAnchor(StartAnchorNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitEndAnchor(EndAnchorNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitEncodedNumber(EncodedNumberNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+
+ @Override
+ public Object visitEncodedPattern(EncodedPatternNode node, Object data) {
+ print(node, data);
+ return null;
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/SimpleNumberEncoder.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/SimpleNumberEncoder.java
new file mode 100644
index 00000000000..8d24581b64d
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/SimpleNumberEncoder.java
@@ -0,0 +1,90 @@
+package datawave.data.normalizer.regex.visitor;
+
+import datawave.data.normalizer.regex.EncodedNumberNode;
+import datawave.data.normalizer.regex.EndAnchorNode;
+import datawave.data.normalizer.regex.EscapedSingleCharNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.RegexParser;
+import datawave.data.normalizer.regex.RegexUtils;
+import datawave.data.normalizer.regex.SingleCharNode;
+import datawave.data.normalizer.regex.StartAnchorNode;
+import datawave.data.type.util.NumericalEncoder;
+
+/**
+ * An implementation of {@link CopyVisitor} that will encode any simple-numbers in the regex pattern, and store them inside a {@link EncodedNumberNode}
+ * instance. Any expressions that do not represent a simple number will not be modified. See the following examples:
+ *
+ * - Input {@code "123\.45"} will return {@code "\+cE1\.2345"}.
+ * - Input {@code "23.*"} will return {@code "23.*"}.
+ * - Input {@code "-342|23.*"} will return {@code "!XE6\.58|23.*"}.
+ *
+ */
+public class SimpleNumberEncoder extends SubExpressionVisitor {
+
+ /**
+ * Return a copy of the given tree with all simple numbers encoded.
+ *
+ * @param node
+ * the node to encode
+ * @return the encoded node, or null if the given node is null
+ */
+ public static Node encode(Node node) {
+ if (node == null) {
+ return null;
+ }
+ SimpleNumberEncoder visitor = new SimpleNumberEncoder();
+ return (Node) node.accept(visitor, null);
+ }
+
+ @Override
+ protected Object visitSubExpression(Node node) {
+ // If the expression is a simple number, encode it and wrap the encoded children in an EncodedNumberNode.
+ if (RegexUtils.isSimpleNumber(node)) {
+ Node normalized = normalizeNumber(node);
+ return new EncodedNumberNode(normalized.getChildren());
+ } else {
+ // Otherwise return a copy.
+ return copy(node);
+ }
+ }
+
+ /**
+ * Create an encoded simple number regex from the given node. It is expected that the given node represents a simple number regex.
+ *
+ * @param node
+ * the node to encode
+ * @return the encoded node.
+ */
+ private Node normalizeNumber(Node node) {
+ // Create a number string from the node. Do not include backslashes or anchor characters.
+ StringBuilder sb = new StringBuilder();
+ for (Node child : node.getChildren()) {
+ if (child instanceof EscapedSingleCharNode) {
+ sb.append(((EscapedSingleCharNode) child).getCharacter());
+ } else if (child instanceof SingleCharNode) {
+ sb.append(((SingleCharNode) child).getCharacter());
+ }
+ }
+
+ // Encode the number, then escape the encoded form.
+ String encodedNumber = NumericalEncoder.encode(sb.toString());
+ encodedNumber = RegexUtils.escapeEncodedNumber(encodedNumber);
+
+ // Parse the escaped, encoded number into a node tree.
+ Node encodedNode = RegexParser.parse(encodedNumber);
+
+ // If the original expression contained a starting anchor, include it in the encoded node as the first child.
+ Node firstChild = node.getFirstChild();
+ if (firstChild instanceof StartAnchorNode) {
+ encodedNode.addChild(firstChild.shallowCopy(), 0);
+ }
+
+ // If the original expression contained an ending anchor, include it in the encoded node as the last child.
+ Node lastChild = node.getLastChild();
+ if (lastChild instanceof EndAnchorNode) {
+ encodedNode.addChild(lastChild.shallowCopy());
+ }
+
+ return encodedNode;
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/StringVisitor.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/StringVisitor.java
new file mode 100644
index 00000000000..3f88cf1ff9c
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/StringVisitor.java
@@ -0,0 +1,192 @@
+package datawave.data.normalizer.regex.visitor;
+
+import java.util.Iterator;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.AnyCharNode;
+import datawave.data.normalizer.regex.CharClassNode;
+import datawave.data.normalizer.regex.CharRangeNode;
+import datawave.data.normalizer.regex.DigitCharClassNode;
+import datawave.data.normalizer.regex.EmptyNode;
+import datawave.data.normalizer.regex.EncodedNumberNode;
+import datawave.data.normalizer.regex.EncodedPatternNode;
+import datawave.data.normalizer.regex.EndAnchorNode;
+import datawave.data.normalizer.regex.EscapedSingleCharNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.GroupNode;
+import datawave.data.normalizer.regex.IntegerNode;
+import datawave.data.normalizer.regex.IntegerRangeNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.OneOrMoreNode;
+import datawave.data.normalizer.regex.QuestionMarkNode;
+import datawave.data.normalizer.regex.RepetitionNode;
+import datawave.data.normalizer.regex.SingleCharNode;
+import datawave.data.normalizer.regex.StartAnchorNode;
+import datawave.data.normalizer.regex.ZeroOrMoreNode;
+
+public class StringVisitor implements Visitor {
+
+ public static String toString(Node node) {
+ if (node == null) {
+ return null;
+ }
+ StringVisitor visitor = new StringVisitor();
+ StringBuilder sb = new StringBuilder();
+ node.accept(visitor, sb);
+ return sb.toString();
+ }
+
+ @Override
+ public Object visitExpression(ExpressionNode node, Object data) {
+ node.childrenAccept(this, data);
+ return null;
+ }
+
+ @Override
+ public Object visitAlternation(AlternationNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ Iterator iterator = node.getChildren().iterator();
+ while (iterator.hasNext()) {
+ iterator.next().accept(this, sb);
+ if (iterator.hasNext()) {
+ sb.append("|");
+ }
+ }
+ return null;
+ }
+
+ @Override
+ public Object visitGroup(GroupNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ sb.append("(");
+ node.childrenAccept(this, sb);
+ sb.append(")");
+ return null;
+ }
+
+ @Override
+ public Object visitDigitChar(DigitCharClassNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ sb.append("\\d");
+ return null;
+ }
+
+ @Override
+ public Object visitCharClass(CharClassNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ sb.append("[");
+ if (node.isNegated()) {
+ sb.append("^");
+ }
+ node.childrenAccept(this, sb);
+ sb.append("]");
+ return null;
+ }
+
+ @Override
+ public Object visitCharRange(CharRangeNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ sb.append(node.getStart()).append("-").append(node.getEnd());
+ return null;
+ }
+
+ @Override
+ public Object visitSingleChar(SingleCharNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ sb.append(node.getCharacter());
+ return null;
+ }
+
+ @Override
+ public Object visitEscapedSingleChar(EscapedSingleCharNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ sb.append("\\").append(node.getCharacter());
+ return null;
+ }
+
+ @Override
+ public Object visitRepetition(RepetitionNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ sb.append("{");
+ node.childrenAccept(this, sb);
+ sb.append("}");
+ return null;
+ }
+
+ @Override
+ public Object visitQuestionMark(QuestionMarkNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ sb.append("?");
+ return null;
+ }
+
+ @Override
+ public Object visitAnyChar(AnyCharNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ sb.append(".");
+ return null;
+ }
+
+ @Override
+ public Object visitZeroToMany(ZeroOrMoreNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ sb.append("*");
+ return null;
+ }
+
+ @Override
+ public Object visitOneToMany(OneOrMoreNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ sb.append("+");
+ return null;
+ }
+
+ @Override
+ public Object visitInteger(IntegerNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ sb.append(node.getValue());
+ return null;
+ }
+
+ @Override
+ public Object visitIntegerRange(IntegerRangeNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ sb.append(node.getStart());
+ sb.append(",");
+ if (node.isEndBounded()) {
+ sb.append(node.getEnd());
+ }
+ return null;
+ }
+
+ @Override
+ public Object visitEmpty(EmptyNode node, Object data) {
+ return null;
+ }
+
+ @Override
+ public Object visitStartAnchor(StartAnchorNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ sb.append("^");
+ return null;
+ }
+
+ @Override
+ public Object visitEndAnchor(EndAnchorNode node, Object data) {
+ StringBuilder sb = (StringBuilder) data;
+ sb.append("$");
+ return null;
+ }
+
+ @Override
+ public Object visitEncodedNumber(EncodedNumberNode node, Object data) {
+ node.childrenAccept(this, data);
+ return null;
+ }
+
+ @Override
+ public Object visitEncodedPattern(EncodedPatternNode node, Object data) {
+ node.childrenAccept(this, data);
+ return null;
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/SubExpressionVisitor.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/SubExpressionVisitor.java
new file mode 100644
index 00000000000..53919964b2f
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/SubExpressionVisitor.java
@@ -0,0 +1,98 @@
+package datawave.data.normalizer.regex.visitor;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import com.google.common.collect.ImmutableSet;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.EncodedNumberNode;
+import datawave.data.normalizer.regex.EncodedPatternNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.GroupNode;
+import datawave.data.normalizer.regex.Node;
+
+/**
+ * An implementation of {@link CopyVisitor} that allows delegation of operations to be performed on sub-expressions of a regex, specifically, each alternated
+ * expression of a regex with alternations, or the entire expression if no alternations are present.
+ */
+public class SubExpressionVisitor extends CopyVisitor {
+
+ private static final Set> VALID_TOP_LEVEL_TYPES = ImmutableSet.of(GroupNode.class, EncodedNumberNode.class, EncodedPatternNode.class);
+
+ @Override
+ public Object visitExpression(ExpressionNode node, Object data) {
+ if (node.getFirstChild() instanceof AlternationNode) {
+ return super.visitExpression(node, data);
+ } else {
+ return visitSubExpression(node);
+ }
+ }
+
+ /**
+ * By default, return a copy of the sub-expression. This method should be overridden by any subclasses that need to manipulate sub-expressions.
+ *
+ * @param node
+ * the sub-expression
+ * @return the visited sub-expression
+ */
+ protected Object visitSubExpression(Node node) {
+ return copy(node);
+ }
+
+ /**
+ * Visit each sub-expression of the alternation with this visitor.
+ *
+ * @param node
+ * the alternation node
+ * @param data
+ * the data
+ * @return null if all visited children returned null, an {@link ExpressionNode} if a single visited child returned a non-null result, or an
+ * {@link AlternationNode} with all non-null results from visited children
+ */
+ @Override
+ public Object visitAlternation(AlternationNode node, Object data) {
+ List children = new ArrayList<>();
+ // Visit each alternated child.
+ for (Node child : node.getChildren()) {
+ Node visited = (Node) child.accept(this, data);
+ // Do not retain null children.
+ if (visited != null) {
+ // If the returned node is an alternation node, retain each child of the returned alternation node.
+ if (visited instanceof AlternationNode) {
+ children.addAll(visited.getChildren());
+ } else if (visited instanceof ExpressionNode) {
+ if (visited.getChildCount() == 1 && visited.getFirstChild() instanceof AlternationNode) {
+ // If the returned node is an expression with an alternation child, retain each child of the alternation node.
+ children.addAll(visited.getFirstChild().getChildren());
+ } else if (visited.getChildCount() == 1 && VALID_TOP_LEVEL_TYPES.contains(visited.getFirstChild().getClass())) {
+ // If the returned node is an expression with a single child that is a top-level node type, retain the first child.
+ children.add(visited.getFirstChild());
+ } else {
+ // Otherwise retain the entire expression.
+ children.add(visited);
+ }
+ } else if (VALID_TOP_LEVEL_TYPES.contains(visited.getClass())) {
+ // If the returned node is a valid top-level class, retain it.
+ children.add(visited);
+ } else {
+ throw new IllegalArgumentException("Visited alternation child must be alternation or expression, but was " + visited);
+ }
+ }
+ }
+
+ // If there are no children, return null.
+ if (children.isEmpty()) {
+ return null;
+ } else if (children.size() == 1) {
+ // If there is only one child, return the child.
+ return children.get(0);
+ } else {
+ // Otherwise return a new alternation node.
+ AlternationNode copy = new AlternationNode();
+ copy.addChildren(children);
+ return copy;
+ }
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/Visitor.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/Visitor.java
new file mode 100644
index 00000000000..b9ebcc01dd0
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/Visitor.java
@@ -0,0 +1,65 @@
+package datawave.data.normalizer.regex.visitor;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.AnyCharNode;
+import datawave.data.normalizer.regex.CharClassNode;
+import datawave.data.normalizer.regex.CharRangeNode;
+import datawave.data.normalizer.regex.DigitCharClassNode;
+import datawave.data.normalizer.regex.EmptyNode;
+import datawave.data.normalizer.regex.EncodedNumberNode;
+import datawave.data.normalizer.regex.EncodedPatternNode;
+import datawave.data.normalizer.regex.EndAnchorNode;
+import datawave.data.normalizer.regex.EscapedSingleCharNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.GroupNode;
+import datawave.data.normalizer.regex.IntegerNode;
+import datawave.data.normalizer.regex.IntegerRangeNode;
+import datawave.data.normalizer.regex.OneOrMoreNode;
+import datawave.data.normalizer.regex.QuestionMarkNode;
+import datawave.data.normalizer.regex.RepetitionNode;
+import datawave.data.normalizer.regex.SingleCharNode;
+import datawave.data.normalizer.regex.StartAnchorNode;
+import datawave.data.normalizer.regex.ZeroOrMoreNode;
+
+ /**
+  * Visitor over the regex node types in {@code datawave.data.normalizer.regex}. There is one visit method per node type. Each method receives the node being
+  * visited and a caller-supplied {@code data} object, and returns an {@code Object} whose meaning is defined by the implementation.
+  */
+ public interface Visitor {
+
+ Object visitExpression(ExpressionNode node, Object data);
+
+ Object visitAlternation(AlternationNode node, Object data);
+
+ Object visitGroup(GroupNode node, Object data);
+
+ Object visitDigitChar(DigitCharClassNode node, Object data);
+
+ Object visitCharClass(CharClassNode node, Object data);
+
+ Object visitCharRange(CharRangeNode node, Object data);
+
+ Object visitSingleChar(SingleCharNode node, Object data);
+
+ Object visitEscapedSingleChar(EscapedSingleCharNode node, Object data);
+
+ Object visitRepetition(RepetitionNode node, Object data);
+
+ Object visitQuestionMark(QuestionMarkNode node, Object data);
+
+ Object visitAnyChar(AnyCharNode node, Object data);
+
+ // Note the naming: this method visits ZeroOrMoreNode.
+ Object visitZeroToMany(ZeroOrMoreNode node, Object data);
+
+ // Note the naming: this method visits OneOrMoreNode.
+ Object visitOneToMany(OneOrMoreNode node, Object data);
+
+ Object visitInteger(IntegerNode node, Object data);
+
+ Object visitIntegerRange(IntegerRangeNode node, Object data);
+
+ Object visitEmpty(EmptyNode node, Object data);
+
+ Object visitStartAnchor(StartAnchorNode node, Object data);
+
+ Object visitEndAnchor(EndAnchorNode node, Object data);
+
+ Object visitEncodedNumber(EncodedNumberNode node, Object data);
+
+ Object visitEncodedPattern(EncodedPatternNode node, Object data);
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroLengthRepetitionTrimmer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroLengthRepetitionTrimmer.java
new file mode 100644
index 00000000000..5c7eb5b8cd3
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroLengthRepetitionTrimmer.java
@@ -0,0 +1,97 @@
+package datawave.data.normalizer.regex.visitor;
+
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.IntegerNode;
+import datawave.data.normalizer.regex.IntegerRangeNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.NodeListIterator;
+import datawave.data.normalizer.regex.QuestionMarkNode;
+import datawave.data.normalizer.regex.RepetitionNode;
+
+/**
+ * Implementation of {@link CopyVisitor} that will return a copy of the tree trimmed of any characters that were immediately followed by a zero-length
+ * repetition quantifier, i.e. {@code {0}} or {@code {0,0}}. See the following examples:
+ *
+ * - Input {@code "123.*"} will return {@code "123.*"}.
+ * - Input {@code "123{3}"} will return {@code "123{3}"}.
+ * - Input {@code "12[3-6]{0}"} will return {@code "12"}.
+ * - Input {@code "12[3-6]{0,0}"} will return {@code "12"}.
+ * - Input {@code "2{0,0}|3{0}"} will return null.
+ *
+ */
+public class ZeroLengthRepetitionTrimmer extends SubExpressionVisitor {
+
+ /**
+ * Return a copy of the given tree trimmed of all characters followed by a zero-length repetition quantifier. If the entire tree is trimmed, null will be
+ * returned, otherwise an {@link ExpressionNode} with the trimmed tree will be returned.
+ *
+ * @param node
+ * the node to trim
+ * @return the trimmed node
+ */
+ public static Node trim(Node node) {
+ if (node == null) {
+ return null;
+ }
+ ZeroLengthRepetitionTrimmer visitor = new ZeroLengthRepetitionTrimmer();
+ return (Node) node.accept(visitor, null);
+ }
+
+ @Override
+ public Object visitExpression(ExpressionNode node, Object data) {
+ Node visited = (Node) super.visitExpression(node, data);
+ return visited != null && visited.isLeaf() ? null : visited;
+ }
+
+ @Override
+ protected Object visitSubExpression(Node node) {
+ Node copy = new ExpressionNode();
+ NodeListIterator iter = node.getChildrenIterator();
+
+ // Check each child for any zero-length repetitions.
+ while (iter.hasNext()) {
+ Node next = iter.next();
+ if (iter.hasNext() && iter.isNextInstanceOf(RepetitionNode.class)) {
+ Node repetition = iter.next();
+ // If we have a zero-length repetition, do not copy it.
+ if (isZeroLengthRepetition(repetition)) {
+ // If there is a ? after the repetition, move past it.
+ if (iter.hasNext() && iter.isNextInstanceOf(QuestionMarkNode.class)) {
+ iter.next();
+ }
+ } else {
+ // Otherwise this is a non-zero length repetition. Copy it.
+ copy.addChild(copy(next));
+ copy.addChild(copy(repetition));
+ }
+ } else {
+ // The child is not followed by a repetition. Copy it.
+ copy.addChild(copy(next));
+ }
+ }
+
+ // If we have any children after removing zero-length repetitions, return the copy. Otherwise, return null.
+ if (copy.hasChildren()) {
+ return copy;
+ } else {
+ return null;
+ }
+ }
+
+ /**
+ * Return whether the given repetition is {@code {0}} or {@code {0,0}}.
+ *
+ * @param node
+ * the node
+ * @return true if the given repetition is a zero-length repetition, or false otherwise
+ */
+ private boolean isZeroLengthRepetition(Node node) {
+ Node child = node.getFirstChild();
+ if (child instanceof IntegerNode) {
+ return ((IntegerNode) child).getValue() == 0;
+ } else {
+ IntegerRangeNode rangeNode = (IntegerRangeNode) child;
+ return rangeNode.getStart() == 0 && rangeNode.isEndBounded() && rangeNode.getEnd() == 0;
+ }
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroTrimmer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroTrimmer.java
new file mode 100644
index 00000000000..fec62759a6e
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroTrimmer.java
@@ -0,0 +1,722 @@
+package datawave.data.normalizer.regex.visitor;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import datawave.data.normalizer.ZeroRegexStatus;
+import datawave.data.normalizer.regex.AnyCharNode;
+import datawave.data.normalizer.regex.EncodedPatternNode;
+import datawave.data.normalizer.regex.EscapedSingleCharNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.GroupNode;
+import datawave.data.normalizer.regex.IntegerNode;
+import datawave.data.normalizer.regex.IntegerRangeNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.NodeListIterator;
+import datawave.data.normalizer.regex.NodeType;
+import datawave.data.normalizer.regex.OneOrMoreNode;
+import datawave.data.normalizer.regex.QuestionMarkNode;
+import datawave.data.normalizer.regex.RegexConstants;
+import datawave.data.normalizer.regex.RegexUtils;
+import datawave.data.normalizer.regex.RepetitionNode;
+import datawave.data.normalizer.regex.SingleCharNode;
+import datawave.data.normalizer.regex.ZeroOrMoreNode;
+
+/**
+ * Implementation of {@link CopyVisitor} that trims and consolidates leading zeros for partially encoded regex patterns.
+ */
+public class ZeroTrimmer extends CopyVisitor {
+
+ /**
+  * Visit the given tree with a new {@link ZeroTrimmer} and return the result: a copy of the tree with the leading zeros of partially encoded regex
+  * patterns trimmed and/or consolidated.
+  *
+  * @param node
+  *            the node
+  * @return the trimmed tree, or null if the given node is null
+  */
+ public static Node trim(Node node) {
+     return node == null ? null : (Node) node.accept(new ZeroTrimmer(), null);
+ }
+
+ /**
+  * Classify the given encoded regex nodes by where they may contain zeros. The leading check is made first, so a pattern that qualifies as both leading
+  * and trailing is reported as {@link ZeroRegexStatus#LEADING}.
+  *
+  * @param encodedRegexNodes
+  *            the nodes of the encoded regex
+  * @return {@link ZeroRegexStatus#LEADING}, {@link ZeroRegexStatus#TRAILING}, or {@link ZeroRegexStatus#NONE}
+  */
+ public static ZeroRegexStatus getStatus(List<Node> encodedRegexNodes) {
+     if (hasPossiblyLeadingZeroes(encodedRegexNodes)) {
+         return ZeroRegexStatus.LEADING;
+     }
+     if (hasTrailingZeroes(encodedRegexNodes)) {
+         return ZeroRegexStatus.TRAILING;
+     }
+     return ZeroRegexStatus.NONE;
+ }
+
+ /**
+  * Return whether the given nodes may end with a zero. The nodes are examined from the last element backwards. Question marks and quantifiers attached to
+  * an element are skipped. An element that explicitly matches zero yields true, an element that can match zero without doing so explicitly is stepped
+  * over, and an element that cannot match zero yields false. If every element is stepped over, true is returned.
+  *
+  * @param encodedRegexNodes
+  *            the nodes of the encoded regex; this list is not modified
+  * @return true if the pattern may have trailing zeros, or false otherwise
+  */
+ private static boolean hasTrailingZeroes(List<Node> encodedRegexNodes) {
+     // Reverse a copy rather than the argument itself, so the caller's list keeps its order.
+     List<Node> reversed = new ArrayList<>(encodedRegexNodes);
+     Collections.reverse(reversed);
+
+     NodeListIterator iter = new NodeListIterator(reversed);
+
+     while (iter.hasNext()) {
+         // In reverse order, the question marks and quantifier of an element come before the element itself.
+         iter.seekPastQuestionMarks();
+         iter.seekPastQuantifiers();
+         iter.seekPastQuestionMarks();
+
+         Node next = iter.peekNext();
+
+         if (!RegexUtils.matchesZero(next)) {
+             return false;
+         }
+         if (RegexUtils.matchesZeroExplicitly(next)) {
+             return true;
+         }
+         iter.next();
+     }
+     return true;
+ }
+
+ /**
+  * Return whether the given nodes may start with a zero. Hyphen characters and escaped periods at the front are stepped over. The first other element
+  * decides the result: true if it can match zero, false otherwise. If the list is exhausted without reaching such an element, true is returned.
+  *
+  * @param encodedRegexNodes
+  *            the nodes of the encoded regex
+  * @return true if the pattern may have leading zeros, or false otherwise
+  */
+ private static boolean hasPossiblyLeadingZeroes(List<Node> encodedRegexNodes) {
+     NodeListIterator iter = new NodeListIterator(encodedRegexNodes);
+
+     while (iter.hasNext()) {
+         Node next = iter.peekNext();
+
+         if (RegexUtils.matchesZero(next)) {
+             return true;
+         }
+         boolean skippable = RegexUtils.isChar(next, RegexConstants.HYPHEN) || next.equals(new EscapedSingleCharNode(RegexConstants.PERIOD));
+         if (!skippable) {
+             return false;
+         }
+         iter.next();
+     }
+
+     return true;
+ }
+
+ /**
+  * Return a copy of the encoded pattern in which the elements after the 'E' character have their leading and trailing zeros trimmed and/or consolidated.
+  * Children up to and including the 'E' are copied unchanged. Decimal points after the 'E' are dropped.
+  *
+  * @param node
+  *            the encoded pattern node
+  * @param data
+  *            the data (not used)
+  * @return the trimmed {@link EncodedPatternNode}
+  */
+ @Override
+ public Object visitEncodedPattern(EncodedPatternNode node, Object data) {
+     EncodedPatternNode trimmed = new EncodedPatternNode();
+
+     // Copy each child up to (inclusively) the 'E' character.
+     // NOTE(review): if no 'E' child is present, every child is copied here and is then also processed below, since startOfRemainingNodes stays 0.
+     // Presumably an encoded pattern always contains 'E' - confirm.
+     int startOfRemainingNodes = 0;
+     for (int i = 0; i < node.getChildCount(); i++) {
+         Node child = node.getChildAt(i);
+         trimmed.addChild(copy(child));
+         if (RegexUtils.isChar(child, RegexConstants.CAPITAL_E)) {
+             startOfRemainingNodes = i + 1;
+             break;
+         }
+     }
+
+     // Copy the remaining children into a separate list. This list will be modified as zeros are trimmed.
+     List<Node> nodes = new ArrayList<>();
+     for (int i = startOfRemainingNodes; i < node.getChildCount(); i++) {
+         Node child = node.getChildAt(i);
+         // At this point we no longer need to keep the original decimal point. A new decimal point will be added later in the correct spot.
+         if (!RegexUtils.isDecimalPoint(child)) {
+             nodes.add(copy(child));
+         }
+     }
+
+     // A pattern consisting of a single regex element needs no trimming. Otherwise trim leading zeros first, then trailing zeros.
+     if (!isSingleElementPattern(nodes)) {
+         nodes = trimLeadingZeros(nodes);
+         nodes = trimTrailingZeros(nodes);
+     }
+
+     trimmed.addChildren(nodes);
+     return trimmed;
+ }
+
+ /**
+  * Trim/consolidate leading zeros: first remove the leading elements that match zero only, then consolidate what may still be a leading zero.
+  *
+  * @param nodes
+  *            the nodes to trim
+  * @return the trimmed nodes
+  */
+ private List<Node> trimLeadingZeros(List<Node> nodes) {
+     List<Node> withoutZeroOnlyElements = trimLeadingZeroOnlyElements(nodes);
+     return consolidatePossibleLeadingZeros(withoutZeroOnlyElements);
+ }
+
+ /**
+  * Trim/consolidate trailing zeros. The list is reversed so that the trailing elements can be processed from the front, and the result is reversed back
+  * before it is returned. The given list is reversed in place.
+  *
+  * @param nodes
+  *            the nodes to trim
+  * @return the trimmed nodes, in the original order
+  */
+ private List<Node> trimTrailingZeros(List<Node> nodes) {
+     // Reverse the nodes so that trailing elements come first.
+     Collections.reverse(nodes);
+     List<Node> trimmed = trimTrailingZeroOnlyElements(nodes);
+     trimmed = consolidatePossibleTrailingZeros(trimmed);
+     // Restore the original order.
+     Collections.reverse(trimmed);
+     return trimmed;
+ }
+
+ /**
+  * Return true if the given list consists only of one regex element that may or may not be followed by a quantifier or question mark.
+  *
+  * @param nodes
+  *            the nodes
+  * @return true if the list consists of a single element pattern, or false otherwise
+  */
+ private boolean isSingleElementPattern(List<Node> nodes) {
+     NodeListIterator iter = new NodeListIterator(nodes);
+     // Step over the first element and whatever quantifiers and question marks follow it. Anything left over means there is a second element.
+     iter.next();
+     iter.seekPastQuantifiers();
+     iter.seekPastQuestionMarks();
+     boolean hasMoreElements = iter.hasNext();
+     return !hasMoreElements;
+ }
+
+ /**
+  * Trim all leading nodes that only match zero. Trimming will stop once the first element that can match something other than zero is seen.
+  *
+  * @param nodes
+  *            the nodes
+  * @return the given list if nothing was trimmed, otherwise a new list holding the remaining nodes
+  */
+ private List<Node> trimLeadingZeroOnlyElements(List<Node> nodes) {
+     NodeListIterator iter = new NodeListIterator(nodes);
+     // Move past each leading element that matches zero only, along with any quantifiers and/or question marks after it.
+     while (iter.hasNext() && RegexUtils.matchesZeroOnly(iter.peekNext())) {
+         iter.next();
+         iter.seekPastQuantifiers();
+         iter.seekPastQuestionMarks();
+     }
+
+     // The iterator index is now the position of the first retained node.
+     int firstRetained = iter.index();
+     return firstRetained == 0 ? nodes : new ArrayList<>(nodes.subList(firstRetained, nodes.size()));
+ }
+
+ /**
+  * Return a list with all possible leading zeros consolidated, and any elements made optional as needed.
+  *
+  * @param nodes
+  *            the nodes to consolidate
+  * @return the given list if it is empty or its first node cannot match zero, otherwise a new list of consolidated nodes
+  */
+ private List<Node> consolidatePossibleLeadingZeros(List<Node> nodes) {
+     // Nothing to do for an empty list (every element may have been trimmed already), or when the first node cannot match zero.
+     if (nodes.isEmpty() || !RegexUtils.matchesZero(nodes.get(0))) {
+         return nodes;
+     }
+
+     NodeListIterator iter = new NodeListIterator(nodes);
+     List<Node> consolidated = new ArrayList<>();
+     while (iter.hasNext()) {
+         // Only peek here: the consolidation methods consume the nodes themselves.
+         Node next = iter.peekNext();
+         if (!RegexUtils.matchesZero(next)) {
+             break;
+         }
+         // Pick the consolidation method based on whether the node can match only zero, or other numbers as well.
+         if (RegexUtils.matchesZeroOnly(next)) {
+             consolidated.addAll(consolidateLeadingMatchesZeroOnly(iter));
+         } else {
+             consolidated.addAll(consolidateLeadingMatchesZero(iter));
+         }
+     }
+
+     // Add the remaining nodes unchanged.
+     while (iter.hasNext()) {
+         consolidated.add(iter.next());
+     }
+     return consolidated;
+ }
+
+ /**
+ * Consolidate the next consecutive leading elements that can match zero. Each such element is made optional, either by rewriting its quantifier or by
+ * appending a question mark, since it may stand for a leading zero. Elements directly after it that only match zero are consolidated via
+ * {@link #consolidateLeadingMatchesZeroOnly(NodeListIterator)}. Iteration stops at the first element that cannot match zero; that element is not consumed.
+ *
+ * @param iter
+ * the iterator, positioned at the first element to consolidate
+ * @return the consolidated nodes.
+ */
+ private List consolidateLeadingMatchesZero(NodeListIterator iter) {
+ List nodes = new ArrayList<>();
+ while (iter.hasNext()) {
+ // Do not call next until we know the next node can match zero.
+ Node next = iter.peekNext();
+ // The next node can match zero. The first call to next should always return an element that can match zero, but not only zero.
+ if (RegexUtils.matchesZero(next)) {
+ iter.next();
+ // If the node is followed by a quantifier and/or optional, evaluate the quantifier.
+ if (iter.isNextQuantifier()) {
+ Node quantifier = iter.next();
+ switch (quantifier.getType()) {
+ case ZERO_OR_MORE:
+ case ONE_OR_MORE:
+ // In both the case of * or + for a leading zero, we must ensure that * is used in the final regex to allow for zero occurrences of
+ // the leading zero when matching.
+ nodes.add(next);
+ nodes.add(new ZeroOrMoreNode());
+ // If the quantifier was followed by ?, append the ?.
+ if (iter.isNextQuestionMark()) {
+ nodes.add(iter.next());
+ }
+ break;
+ case REPETITION:
+ RepetitionNode repetition = (RepetitionNode) quantifier;
+ // If the repetition does not already allow for zero occurrences, we must create a new repetition quantifier that does so.
+ if (!RegexUtils.repetitionCanOccurZeroTimes(repetition)) {
+ if (RegexUtils.isNotRange(repetition)) {
+ // If the repetition has the form {x}, replace it with {0,x}. For example, "[012]{3}" will become "[012]{0,3}".
+ nodes.add(next);
+ nodes.add(RegexUtils.createRangeStartingFromZero(repetition));
+ // If the original quantifier was followed by ?, append it.
+ if (iter.isNextQuestionMark()) {
+ nodes.add(iter.next());
+ }
+ } else {
+ // If the repetition has the form {x,y}, where x is a value greater than zero, we must wrap the element and the repetition
+ // in an optional group to allow for it to occur either zero times, or x-y times. For example, "[012]{3,5}" will become
+ // "([012]{3,5})?". Create a group node with the element and repetition as its children.
+ GroupNode groupNode = new GroupNode();
+ groupNode.addChild(next);
+ groupNode.addChild(repetition);
+ // If the original quantifier was followed by ?, include it in the group.
+ if (iter.isNextQuestionMark()) {
+ groupNode.addChild(iter.next());
+ }
+ // Add the group node and make it optional.
+ nodes.add(groupNode);
+ nodes.add(new QuestionMarkNode());
+ }
+ } else {
+ // The repetition allows for zero occurrences. No modifications need to be made.
+ nodes.add(next);
+ nodes.add(repetition);
+ if (iter.isNextQuestionMark()) {
+ nodes.add(iter.next());
+ }
+ }
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported quantifier type: " + quantifier.getType());
+ }
+ } else {
+ // Add the node and make it optional since it can possibly be a leading zero, and thus must be optional.
+ nodes.add(next);
+ nodes.add(new QuestionMarkNode());
+ }
+
+ // If there are any elements directly after the current element that only match zero, consolidate them and add the result.
+ if (iter.hasNext() && RegexUtils.matchesZeroOnly(iter.peekNext())) {
+ nodes.addAll(consolidateLeadingMatchesZeroOnly(iter));
+ }
+ } else {
+ // The next element cannot match zero. Nothing more to do.
+ break;
+ }
+ }
+ return nodes;
+ }
+
+ /**
+  * Consolidate the next consecutive elements that can only match zero into a single optional zero. The minimum and maximum number of times the zeros can
+  * occur are summed over the consecutive elements and expressed as one quantifier: {@code 0?} when both are 1, otherwise {@code (0*)?}, {@code (0+)?},
+  * {@code (0{x})?}, {@code (0{x,y})?} or {@code (0{x,})?}. Iteration stops at the first element that does not only match zero; that element is not
+  * consumed.
+  *
+  * @param iter
+  *            the iterator, positioned at the first element to consolidate
+  * @return a list of the consolidated nodes
+  */
+ private List<Node> consolidateLeadingMatchesZeroOnly(NodeListIterator iter) {
+     // Track the minimum and maximum times a leading zero can occur. A max of -1 means unlimited and, once set, is never changed again.
+     int minZeroCount = 0;
+     int maxZeroCount = 0;
+
+     // Only peek in the loop condition, so an element that does not only match zero is left for the caller.
+     while (iter.hasNext() && RegexUtils.matchesZeroOnly(iter.peekNext())) {
+         iter.next();
+         if (iter.isNextQuantifier()) {
+             // The zero has a quantifier. Add its bounds to the running totals.
+             Pair<Integer,Integer> quantifierRange = RegexUtils.getQuantifierRange(iter.next());
+             minZeroCount += quantifierRange.getLeft();
+             if (maxZeroCount != -1) {
+                 // A quantifier range without an upper bound is equivalent to unlimited.
+                 if (quantifierRange.getRight() == null) {
+                     maxZeroCount = -1;
+                 } else {
+                     maxZeroCount += quantifierRange.getRight();
+                 }
+             }
+         } else {
+             // A zero without a quantifier occurs exactly once.
+             minZeroCount++;
+             if (maxZeroCount != -1) {
+                 maxZeroCount++;
+             }
+         }
+         // Skip any question marks if present.
+         iter.seekPastQuestionMarks();
+     }
+
+     List<Node> nodes = new ArrayList<>();
+     // If the min and max are both 1, return 0?
+     if (minZeroCount == 1 && maxZeroCount == 1) {
+         nodes.add(new SingleCharNode(RegexConstants.ZERO));
+         nodes.add(new QuestionMarkNode());
+         return nodes;
+     }
+
+     // Otherwise return 0 followed by a quantifier, inside an optional group.
+     GroupNode groupNode = new GroupNode();
+     groupNode.addChild(new SingleCharNode(RegexConstants.ZERO));
+
+     if (maxZeroCount == -1 && minZeroCount < 2) {
+         if (minZeroCount == 0) {
+             // Return (0*)?
+             groupNode.addChild(new ZeroOrMoreNode());
+         } else if (minZeroCount == 1) {
+             // Return (0+)?
+             groupNode.addChild(new OneOrMoreNode());
+         }
+     } else {
+         RepetitionNode repetition = new RepetitionNode();
+         if (minZeroCount == maxZeroCount) {
+             // Return (0{x})?
+             repetition.addChild(new IntegerNode(minZeroCount));
+         } else {
+             // Return (0{x,y})? or (0{x,})? if unlimited max.
+             IntegerRangeNode integerRange = new IntegerRangeNode();
+             integerRange.setStart(minZeroCount);
+             if (maxZeroCount != -1) {
+                 integerRange.setEnd(maxZeroCount);
+             }
+             repetition.addChild(integerRange);
+         }
+         groupNode.addChild(repetition);
+     }
+
+     nodes.add(groupNode);
+     // Ensure the group is optional.
+     nodes.add(new QuestionMarkNode());
+     return nodes;
+ }
+
+ /**
+  * Trim all trailing nodes that explicitly only match zero. Trimming will stop once the first element that can match something other than zero is seen.
+  * The given list is expected to be in reverse order, so the trailing elements are at the front, each preceded by its question marks and quantifiers.
+  *
+  * @param nodes
+  *            the nodes, in reverse order
+  * @return the given list if nothing was trimmed, otherwise a new list holding the remaining nodes
+  */
+ private List<Node> trimTrailingZeroOnlyElements(List<Node> nodes) {
+     NodeListIterator iter = new NodeListIterator(nodes);
+
+     while (iter.hasNext()) {
+         // Remember where this element starts, so the index can be restored if the element has to be kept.
+         int elementStart = iter.index();
+         // Skip past any question marks or quantifiers that come before the element in the reversed list.
+         iter.seekPastQuestionMarks();
+         iter.seekPastQuantifiers();
+         if (!RegexUtils.matchesZeroOnly(iter.peekNext())) {
+             // Keep this element together with its question marks and quantifiers.
+             iter.setIndex(elementStart);
+             break;
+         }
+         // The element matches zero only. Skip past it.
+         iter.next();
+     }
+
+     // The iterator index is now the position of the first retained node.
+     int firstRetained = iter.index();
+     return firstRetained == 0 ? nodes : new ArrayList<>(nodes.subList(firstRetained, nodes.size()));
+ }
+
+ /**
+ * Return a list with all possible trailing zeros consolidated, and any elements made optional as needed. The given list is expected to be in reverse
+ * order (trimTrailingZeros reverses it before calling this method), so a question mark and quantifier come before the element they apply to, and the
+ * returned list is in reverse order as well.
+ *
+ * @param nodes
+ * the nodes to consolidate, in reverse order
+ * @return a list of consolidated nodes, in reverse order
+ */
+ private List consolidatePossibleTrailingZeros(List nodes) {
+ // List of consolidated nodes.
+ List consolidated = new ArrayList<>();
+ NodeListIterator iter = new NodeListIterator(nodes);
+
+ // Check if the pattern ends with '.+' or '.+?'. In this case, the '.+' must become a '.*' to allow for matching against numbers that had trailing zeros
+ // that were subsequently trimmed when encoded.
+ if (iter.hasNext()) {
+ // Remember the starting index so that the nodes read below can be re-read by the main loop if the pattern does not end with '.+'.
+ int lastIndex = iter.index();
+ Node questionMark = iter.isNextQuestionMark() ? iter.next() : null;
+ Node quantifier = iter.isNextQuantifier() ? iter.next() : null;
+ Node next = iter.next();
+ // if the last element of the pattern is .+, convert it to .*.
+ if (next.getType() == NodeType.ANY_CHAR && quantifier != null && quantifier.getType() == NodeType.ONE_OR_MORE) {
+ // The nodes are added in reverse order: the optional ?, then the quantifier, then the element.
+ if (questionMark != null) {
+ consolidated.add(questionMark);
+ }
+ consolidated.add(new ZeroOrMoreNode());
+ consolidated.add(new AnyCharNode());
+ } else {
+ // Otherwise reset the index to the initial index.
+ iter.setIndex(lastIndex);
+ }
+ }
+
+ // Iterate through each child.
+ while (iter.hasNext()) {
+ // Remember where this element (including its question marks and quantifiers) starts. The index is always restored to this point before
+ // the element is handed to a consolidation method or left for the copy loop below.
+ int lastIndex = iter.index();
+ iter.seekPastQuestionMarks();
+ iter.seekPastQuantifiers();
+
+ // Do not call next until we know the next node can match zero.
+ Node next = iter.peekNext();
+ // The next node can match zero. Call next, and call the specific consolidation method based on whether the node can match only zero, or other
+ // numbers.
+ if (RegexUtils.matchesZero(next)) {
+ if (RegexUtils.matchesZeroOnly(next)) {
+ iter.setIndex(lastIndex);
+ consolidated.addAll(consolidateTrailingMatchesZeroOnly(iter));
+ } else {
+ iter.setIndex(lastIndex);
+ consolidated.addAll(consolidateTrailingMatchesZero(iter));
+ }
+ } else {
+ // Reset the index to the non-zero matching element.
+ iter.setIndex(lastIndex);
+ break;
+ }
+ }
+
+ // Add the remaining nodes to the list to return.
+ while (iter.hasNext()) {
+ consolidated.add(iter.next());
+ }
+ return consolidated;
+ }
+
+ /**
+ * Consolidate the next consecutive trailing elements that can match zero, rewriting each of them so that it may also occur zero times.
+ *
+ * @param iter
+ * the iterator, positioned before the first element to consolidate (and before its question mark and quantifier, if present)
+ * @return the consolidated nodes.
+ */
+ private List consolidateTrailingMatchesZero(NodeListIterator iter) {
+ List nodes = new ArrayList<>();
+ while (iter.hasNext()) {
+ int lastIndex = iter.index();
+
+ // Skip past and capture the optional and quantifier for the node if present.
+ Node questionMark = iter.isNextQuestionMark() ? iter.next() : null;
+ Node quantifier = iter.isNextQuantifier() ? iter.next() : null;
+ Node next = iter.next();
+ // The next node can match zero. The first call to next should always return an element that can match zero, but not only zero.
+ if (RegexUtils.matchesZero(next)) {
+ // If the next node had a quantifier, evaluate the quantifier.
+ if (quantifier != null) {
+ switch (quantifier.getType()) {
+ case ZERO_OR_MORE:
+ case ONE_OR_MORE:
+ // In both the case of * or + for a trailing zero, we must ensure that * is used in the final regex to allow for zero occurrences of
+ // the trailing zero when matching.
+ // If the quantifier was followed by ?, append the ?.
+ if (questionMark != null) {
+ nodes.add(questionMark);
+ }
+ nodes.add(new ZeroOrMoreNode());
+ nodes.add(next);
+ break;
+ case REPETITION:
+ RepetitionNode repetition = (RepetitionNode) quantifier;
+ // If the repetition does not already allow for zero occurrences, we must create a new repetition quantifier that does so.
+ if (!RegexUtils.repetitionCanOccurZeroTimes(repetition)) {
+ if (RegexUtils.isNotRange(repetition)) {
+ // If the repetition has the form {x}, replace it with {0,x}. For example, "[012]{3}" will become "[012]{0,3}".
+ // If the original quantifier was followed by ?, append it.
+ if (questionMark != null) {
+ nodes.add(questionMark);
+ }
+ nodes.add(RegexUtils.createRangeStartingFromZero(repetition));
+ nodes.add(next);
+ } else {
+ // If the repetition has the form {x,y}, where x is a value greater than zero, we must wrap the element and the repetition
+ // in an optional group to allow for it to occur either zero times, or x-y times. For example, "[012]{3,5}" will become
+ // "([012]{3,5})?". Create a group node with the element and repetition as its children.
+ GroupNode groupNode = new GroupNode();
+ groupNode.addChild(next);
+ groupNode.addChild(repetition);
+ // If the original quantifier was followed by ?, include it in the group.
+ if (questionMark != null) {
+ groupNode.addChild(questionMark);
+ }
+ // Make the group optional.
+ nodes.add(new QuestionMarkNode());
+ nodes.add(groupNode);
+ }
+ } else {
+ // The repetition allows for zero occurrences. No modifications need to be made.
+ if (questionMark != null) {
+ nodes.add(questionMark);
+ }
+ nodes.add(repetition);
+ nodes.add(next);
+ }
+ break;
+ default:
+ throw new IllegalArgumentException("Unsupported quantifier type: " + quantifier.getType());
+ }
+ } else {
+ // This is a single element. Make it optional.
+ nodes.add(new QuestionMarkNode());
+ nodes.add(next);
+ }
+
+ // If there are any elements after the current element that only match zero, consolidate them and add the result.
+ if (iter.hasNext()) {
+ lastIndex = iter.index();
+ iter.seekPastQuestionMarks();
+ iter.seekPastQuantifiers();
+ if (RegexUtils.matchesZeroOnly(iter.peekNext())) {
+ iter.setIndex(lastIndex);
+ nodes.addAll(consolidateTrailingMatchesZeroOnly(iter));
+ } else {
+ iter.setIndex(lastIndex);
+ }
+ }
+ } else {
+ // The next element cannot match zero. Nothing more to do. Reset the index to right before the non-zero element.
+ iter.setIndex(lastIndex);
+ break;
+ }
+ }
+ return nodes;
+ }
+
+ /**
+ * Consolidate the next consecutive elements that can only match zero into a single optional zero element or optional zero group.
+ *
+ * @param iter
+ * the iterator, positioned before the first zero-only element (and before its question mark and quantifier, if present)
+ * @return a list of the consolidated nodes
+ */
+ private List consolidateTrailingMatchesZeroOnly(NodeListIterator iter) {
+ // We need to track the minimum and maximum times a trailing zero can occur. A max of -1 means there is no upper bound.
+ int minZeroCount = 0;
+ int maxZeroCount = 0;
+
+ while (iter.hasNext()) {
+ int lastIndex = iter.index();
+ // Skip any question mark if present.
+ iter.seekPastQuestionMarks();
+ // Grab the quantifier if present.
+ Node quantifier = iter.isNextQuantifier() ? iter.next() : null;
+
+ // Do not call next until we've confirmed the next node only matches zero.
+ Node next = iter.peekNext();
+ if (RegexUtils.matchesZeroOnly(next)) {
+ // Explicitly call next now.
+ iter.next();
+ // If the zero has a quantifier, extract the quantifier range.
+ if (quantifier != null) {
+ Pair quantifierRange = RegexUtils.getQuantifierRange(quantifier);
+ // Increment the lower bound.
+ minZeroCount += quantifierRange.getLeft();
+ if (maxZeroCount != -1) {
+ // If the quantifier range has no defined upper bound, that is equivalent to unlimited. Set the max bound to -1 to ensure it is not
+ // changed.
+ if (quantifierRange.getRight() == null) {
+ maxZeroCount = -1;
+ } else {
+ // Otherwise increment the upper bound.
+ maxZeroCount += quantifierRange.getRight();
+ }
+ }
+ } else {
+ // The zero does not have a quantifier. Increment the min count by one, and increment the max count only if we have not yet determined that
+ // the max should be considered unlimited.
+ minZeroCount++;
+ if (maxZeroCount != -1) {
+ maxZeroCount++;
+ }
+ }
+ } else {
+ // If the next node does not only match zero, stop iterating and rewind to before its question mark and quantifier.
+ iter.setIndex(lastIndex);
+ break;
+ }
+ }
+
+ List nodes = new ArrayList<>();
+ // Make the element optional. The question mark is added first because the node list is built in reverse order.
+ nodes.add(new QuestionMarkNode());
+
+ // If the min and max are both 1, return "0?".
+ if (minZeroCount == 1 && maxZeroCount == 1) {
+ nodes.add(new SingleCharNode(RegexConstants.ZERO));
+ } else {
+ // Otherwise we need to return 0 followed by a quantifier inside an optional group.
+ GroupNode groupNode = new GroupNode();
+ groupNode.addChild(new SingleCharNode(RegexConstants.ZERO));
+
+ if (maxZeroCount == -1 && minZeroCount < 2) {
+ if (minZeroCount == 0) {
+ // Return (0*)?
+ groupNode.addChild(new ZeroOrMoreNode());
+ } else if (minZeroCount == 1) {
+ // Return (0+)?
+ groupNode.addChild(new OneOrMoreNode());
+ }
+ } else {
+ RepetitionNode repetition = new RepetitionNode();
+ if (minZeroCount == maxZeroCount) {
+ // Return (0{x})?
+ IntegerNode integer = new IntegerNode(minZeroCount);
+ repetition.addChild(integer);
+ } else {
+ // Return (0{x,y})? or (0{x,})? if unlimited max.
+ IntegerRangeNode integerRange = new IntegerRangeNode();
+ integerRange.setStart(minZeroCount);
+ if (maxZeroCount != -1) {
+ integerRange.setEnd(maxZeroCount);
+ }
+ repetition.addChild(integerRange);
+ }
+
+ groupNode.addChild(repetition);
+ }
+
+ nodes.add(groupNode);
+ }
+
+ return nodes;
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroValueNormalizer.java b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroValueNormalizer.java
new file mode 100644
index 00000000000..4351b38045f
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/normalizer/regex/visitor/ZeroValueNormalizer.java
@@ -0,0 +1,157 @@
+package datawave.data.normalizer.regex.visitor;
+
+import java.util.function.Consumer;
+
+import datawave.data.normalizer.regex.AlternationNode;
+import datawave.data.normalizer.regex.ExpressionNode;
+import datawave.data.normalizer.regex.Node;
+import datawave.data.normalizer.regex.NodeListIterator;
+import datawave.data.normalizer.regex.RegexConstants;
+import datawave.data.normalizer.regex.RegexUtils;
+import datawave.data.normalizer.regex.SingleCharNode;
+
+/**
+ * Implementation of {@link CopyVisitor} that:
+ *
+ * - Simplifies any positive non-simple number patterns that can only match zero to {@code "0"}.
+ * - Simplifies any negative non-simple number patterns that can only match zero to {@code "0"}.
+ * - Identifies any negative non-simple number patterns that can match zero, and adds a {@code "0"} alternation.
+ *
+ */
+public class ZeroValueNormalizer extends SubExpressionVisitor {
+
+ public static Node expand(Node node) {
+ if (node == null) {
+ return null;
+ }
+ ZeroValueNormalizer normalizer = new ZeroValueNormalizer();
+ return (Node) node.accept(normalizer, null);
+ }
+
+ @Override
+ protected Object visitSubExpression(Node node) {
+ // If the node represents a simple number, return a copy of it.
+ if (RegexUtils.isSimpleNumber(node)) {
+ return copy(node);
+ }
+
+ return normalizePattern(node, RegexUtils.isNegativeRegex(node));
+ }
+
+ private Node normalizePattern(Node node, boolean negative) {
+ // If the pattern can only match zero, simplify it to just '0'.
+ if (matchesZeroOnly(node, negative)) {
+ return createZeroCharExpression();
+ }
+ // If the pattern can match zero, add an alternation for '0'.
+ if (patternMatchesZero(node, negative)) {
+ AlternationNode alternation = new AlternationNode();
+ alternation.addChild(copy(node)); // copy, as in the other branches, so the returned tree does not share nodes with the input
+ alternation.addChild(createZeroCharExpression());
+ return new ExpressionNode(alternation);
+ }
+ // Otherwise the pattern can match numbers other than zero. Return a copy of it.
+ return copy(node);
+ }
+
+ /**
+ * Return whether the given pattern will only match 0.
+ *
+ * @param node
+ * the node
+ * @param negative
+ * whether the pattern is negative
+ * @return true if the pattern will only match 0, or false otherwise
+ */
+ private boolean matchesZeroOnly(Node node, boolean negative) {
+ // The minimum child count and index of the first non-minus sign node depends on whether the pattern is negative.
+ int minChildCount = negative ? 2 : 1;
+ int firstChild = negative ? 1 : 0;
+ if (node.getChildCount() == minChildCount) {
+ // If the minimum number of children is present, return whether it matches zero only.
+ return RegexUtils.matchesZeroOnly(node.getChildAt(firstChild));
+ } else {
+ // If there are multiple children, return whether all children match zero only.
+ NodeListIterator iter = node.getChildrenIterator();
+ // Skip past the minus sign if present.
+ if (negative) {
+ iter.next();
+ }
+ // Seek past all elements that only match zero.
+ seekPastAllZeroOnlyElements(iter);
+ return !iter.hasNext();
+ }
+ }
+
+ /**
+ * Return true if the given pattern can match zero.
+ *
+ * @param node the pattern
+ * @param negative whether the pattern is negative
+ * @return true if the pattern can match 0, or false otherwise
+ */
+ private boolean patternMatchesZero(Node node, boolean negative) {
+ // The minimum child count and index of the first non-minus sign node depends on whether the pattern is negative.
+ int minChildCount = negative ? 2 : 1;
+ int firstChild = negative ? 1 : 0;
+ if (node.getChildCount() == minChildCount) {
+ // If the minimum number of children is present, return whether it can match zero.
+ return RegexUtils.matchesZero(node.getChildAt(firstChild));
+ } else {
+ // If there are multiple children, return whether all children can match zero.
+ NodeListIterator iter = node.getChildrenIterator();
+ if (negative) {
+ // Skip past the minus sign.
+ iter.next();
+ }
+ // Seek past all elements that can match zero.
+ seekPastAllZeroMatchingElements(iter);
+ return !iter.hasNext();
+ }
+ }
+
+ /**
+ * Return a new {@link ExpressionNode} that contains the expression {@code "0"}.
+ *
+ * @return the new node
+ */
+ private Node createZeroCharExpression() {
+ return new ExpressionNode(new SingleCharNode(RegexConstants.ZERO));
+ }
+
+ /**
+ * Seek past all consecutive elements that only match zero in the given iterator, including any after a decimal point.
+ *
+ * @param iterator
+ * the iterator
+ */
+ private void seekPastAllZeroOnlyElements(NodeListIterator iterator) {
+ seekPast(iterator, NodeListIterator::seekPastZeroOnlyElements);
+ }
+
+ /**
+ * Seek past all consecutive elements that can match zero in the given iterator, including any after a decimal point.
+ *
+ * @param iterator
+ * the iterator
+ */
+ private void seekPastAllZeroMatchingElements(NodeListIterator iterator) {
+ seekPast(iterator, NodeListIterator::seekPastZeroMatchingElements);
+ }
+
+ /**
+ * Seek past elements using the given delegate function. If a decimal point is present, seek past that as well.
+ *
+ * @param iter
+ * the iterator
+ * @param delegate
+ * the delegate function
+ */
+ private void seekPast(NodeListIterator iter, Consumer delegate) {
+ delegate.accept(iter);
+ if (iter.hasNext() && RegexUtils.isDecimalPoint(iter.peekNext())) {
+ iter.next();
+ delegate.accept(iter);
+ }
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/parser/GeometryParser.java b/core/utils/type-utils/src/main/java/datawave/data/parser/GeometryParser.java
new file mode 100644
index 00000000000..52795afcc1d
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/parser/GeometryParser.java
@@ -0,0 +1,22 @@
+package datawave.data.parser;
+
+import org.locationtech.jts.geom.Geometry;
+
+public abstract class GeometryParser implements Comparable {
+
+ public static final int DEFAULT_PRIORITY = 0;
+
+ public abstract Geometry parseGeometry(String geoString);
+
+ // Used for sorting parsers via compareTo.
+ // Smaller numbers have higher priority, i.e. they sort first.
+ protected abstract int getPriority();
+
+ @Override
+ public int compareTo(GeometryParser other) {
+ int compare = Integer.compare(this.getPriority(), other.getPriority()); // not subtraction: that can overflow for extreme priorities
+ if (compare == 0)
+ compare = this.getClass().getName().compareTo(other.getClass().getName()); // tie-break on class name for a stable order
+ return compare;
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/parser/WKBParser.java b/core/utils/type-utils/src/main/java/datawave/data/parser/WKBParser.java
new file mode 100644
index 00000000000..cc3bca67c7c
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/parser/WKBParser.java
@@ -0,0 +1,34 @@
+package datawave.data.parser;
+
+import org.locationtech.jts.geom.Geometry;
+import org.locationtech.jts.io.WKBReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.io.BaseEncoding;
+
+/**
+ * This class can be used to parse a geometry object from Base64 encoded well-known binary (WKB). Input that cannot be parsed yields {@code null}.
+ */
+public class WKBParser extends GeometryParser {
+
+ private static final Logger log = LoggerFactory.getLogger(WKBParser.class);
+
+ @Override
+ public Geometry parseGeometry(String geoString) {
+ Geometry geom = null;
+ try {
+ byte[] wkbBytes = BaseEncoding.base64().decode(geoString);
+ geom = new WKBReader().read(wkbBytes);
+ } catch (Exception e) { // best effort: any decoding or parsing failure results in a null return
+ if (log.isTraceEnabled())
+ log.trace("Cannot parse WKB geometry from [" + geoString + "]", e); // keep the cause so the failure can be diagnosed
+ }
+ return geom;
+ }
+
+ @Override
+ protected int getPriority() {
+ return DEFAULT_PRIORITY + 1;
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/parser/WKTParser.java b/core/utils/type-utils/src/main/java/datawave/data/parser/WKTParser.java
new file mode 100644
index 00000000000..083a21ca417
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/parser/WKTParser.java
@@ -0,0 +1,39 @@
+package datawave.data.parser;
+
+import org.apache.commons.lang3.StringUtils;
+import org.locationtech.jts.geom.Geometry;
+import org.locationtech.jts.io.WKTReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class WKTParser extends GeometryParser {
+
+ private static final Logger log = LoggerFactory.getLogger(WKTParser.class);
+
+ private static final String[] geomTypes = new String[] {"GEOMETRY", "POINT", "LINESTRING", "POLYGON", "MULTIPOINT", "MULTILINESTRING", "MULTIPOLYGON",
+ "GEOMETRYCOLLECTION", "CIRCULARSTRING", "COMPOUNDCURVE", "CURVEPOLYGON", "MULTICURVE", "MULTISURFACE", "CURVE", "SURFACE", "POLYHEDRALSURFACE",
+ "TIN", "TRIANGLE"};
+ private static final String[] zGeomTypes = new String[geomTypes.length]; // each geometry type name followed by " Z"
+
+ static {
+ for (int i = 0; i < geomTypes.length; i++)
+ zGeomTypes[i] = geomTypes[i] + " Z"; // e.g. "POINT Z"
+ }
+
+ @Override
+ public Geometry parseGeometry(String geoString) {
+ Geometry geom = null;
+ try {
+ geom = new WKTReader().read(StringUtils.replaceEach(geoString, zGeomTypes, geomTypes)); // drop the " Z" suffix from type names before parsing
+ } catch (Exception e) { // best effort: any parsing failure results in a null return
+ if (log.isTraceEnabled())
+ log.trace("Cannot parse WKT geometry from [" + geoString + "]", e); // keep the cause so the failure can be diagnosed
+ }
+ return geom;
+ }
+
+ @Override
+ protected int getPriority() {
+ return DEFAULT_PRIORITY;
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/AbstractGeometryType.java b/core/utils/type-utils/src/main/java/datawave/data/type/AbstractGeometryType.java
new file mode 100644
index 00000000000..27970caccef
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/type/AbstractGeometryType.java
@@ -0,0 +1,99 @@
+package datawave.data.type;
+
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.locationtech.jts.geom.Geometry;
+import org.locationtech.jts.geom.Polygon;
+
+import datawave.data.normalizer.DiscreteIndexNormalizer;
+import datawave.data.normalizer.Normalizer;
+import datawave.data.normalizer.OneToManyNormalizer;
+import datawave.data.type.util.AbstractGeometry;
+
+/**
+ * The base GeoWave geometry type, which provides an implementation for the discrete index type interface.
+ *
+ * @param
+ * The underlying geometry type
+ */
+public abstract class AbstractGeometryType> extends BaseType implements DiscreteIndexType {
+
+ private static final long GEOMETRY_FACTORY_SIZE = 120; // NOTE(review): presumably a hand-estimated byte size of a JTS GeometryFactory - confirm
+ private static final long ENVELOPE_SIZE = 45; // NOTE(review): presumably a hand-estimated byte size of a JTS Envelope - confirm
+ private static final long GEOMETRY_BASE_SIZE = ENVELOPE_SIZE + 20;
+ private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF + Sizer.REFERENCE + GEOMETRY_FACTORY_SIZE;
+
+ public AbstractGeometryType(Normalizer normalizer) {
+ super(normalizer);
+ }
+
+ @Override
+ public String incrementIndex(String index) {
+ return ((DiscreteIndexNormalizer) normalizer).incrementIndex(index); // delegates to the normalizer, which must be a DiscreteIndexNormalizer
+ }
+
+ @Override
+ public String decrementIndex(String index) {
+ return ((DiscreteIndexNormalizer) normalizer).decrementIndex(index);
+ }
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public List discretizeRange(String beginIndex, String endIndex) {
+ return ((DiscreteIndexNormalizer) normalizer).discretizeRange(beginIndex, endIndex);
+ }
+
+ @Override
+ public boolean producesFixedLengthRanges() {
+ return ((DiscreteIndexNormalizer) normalizer).producesFixedLengthRanges();
+ }
+
+ @Override
+ public long sizeInBytes() {
+ long size = STATIC_SIZE + (2 * normalizedValue.length()); // 2 * length: presumably two bytes per character
+
+ if (this instanceof OneToManyNormalizerType) {
+ List values = ((OneToManyNormalizerType>) this).getNormalizedValues();
+ size += 2 * values.stream().map(String::length).map(x -> x + Sizer.REFERENCE).reduce(Integer::sum).orElse(0); // NOTE(review): this doubles Sizer.REFERENCE as well, unlike BaseType.sizeInBytes (2 * length + REFERENCE) - confirm which is intended
+ }
+
+ List leafGeometries = new ArrayList<>(); // geometries that are neither multi-part nor polygons; their coordinates are sized below
+ LinkedList workingList = new LinkedList<>(); // used as a stack (push/pop) to walk the geometry tree without recursion
+ workingList.push(delegate.getJTSGeometry());
+
+ while (!workingList.isEmpty()) {
+ Geometry geom = workingList.pop();
+
+ if (geom.getNumGeometries() > 1) {
+ size += Sizer.OBJECT_OVERHEAD;
+
+ // push all the geometries to the working list
+ for (int i = 0; i < geom.getNumGeometries(); i++) {
+ workingList.push(geom.getGeometryN(i));
+ }
+ } else if (geom instanceof Polygon) {
+ size += 2 * Sizer.OBJECT_OVERHEAD + GEOMETRY_BASE_SIZE;
+
+ Polygon poly = (Polygon) geom;
+
+ // push all the exterior and interior rings to the working list
+ workingList.push(poly.getExteriorRing());
+ for (int i = 0; i < poly.getNumInteriorRing(); i++) {
+ workingList.push(poly.getInteriorRingN(i));
+ }
+
+ } else {
+ size += 3 * Sizer.OBJECT_OVERHEAD + GEOMETRY_BASE_SIZE;
+ leafGeometries.add(geom);
+ }
+ }
+
+ for (Geometry geom : leafGeometries) { // per leaf: array and object overhead plus, per coordinate, 3 * 8 bytes (presumably three doubles), object overhead and a reference
+ size += Sizer.ARRAY_OVERHEAD + Sizer.OBJECT_OVERHEAD + geom.getCoordinates().length * (3 * 8 + Sizer.OBJECT_OVERHEAD + Sizer.REFERENCE)
+ + Sizer.REFERENCE;
+ }
+ return size;
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/BaseType.java b/core/utils/type-utils/src/main/java/datawave/data/type/BaseType.java
new file mode 100644
index 00000000000..9ae9b5c1a55
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/type/BaseType.java
@@ -0,0 +1,175 @@
+package datawave.data.type;
+
+import java.io.Serializable;
+import java.util.Collection;
+import java.util.List;
+
+import datawave.data.normalizer.Normalizer;
+import datawave.webservice.query.data.ObjectSizeOf;
+
+public class BaseType & Serializable> implements Serializable, Type, ObjectSizeOf {
+
+ private static final long serialVersionUID = 5354270429891763693L;
+ private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF + Sizer.REFERENCE + Sizer.REFERENCE;
+
+ protected T delegate;
+ protected String normalizedValue;
+ protected final Normalizer normalizer;
+
+ public BaseType(String delegateString, Normalizer normalizer) {
+ this.normalizer = normalizer;
+ setDelegate(normalizer.denormalize(delegateString));
+ }
+
+ public BaseType(Normalizer normalizer) {
+ this.normalizer = normalizer;
+ }
+
+ public T getDelegate() {
+ return delegate;
+ }
+
+ public void setDelegateFromString(String in) {
+ setDelegate(normalizer.denormalize(in));
+ }
+
+ public void setDelegate(T delegate) {
+ this.delegate = delegate;
+ normalizeAndSetNormalizedValue(this.delegate); // keep normalizedValue in sync with the new delegate
+ }
+
+ public String getNormalizedValue() {
+ return normalizedValue;
+ }
+
+ @Override
+ public T denormalize() {
+ return this.delegate;
+ }
+
+ public void setNormalizedValue(String normalizedValue) {
+ this.normalizedValue = normalizedValue;
+ }
+
+ public int compareTo(Type o) {
+ return this.getDelegate().compareTo(o.getDelegate());
+ }
+
+ public String normalize() {
+ return normalizer.normalizeDelegateType(this.delegate);
+ }
+
+ public String normalize(String in) {
+ return normalizer.normalize(in);
+ }
+
+ public Collection expand(String in) {
+ return normalizer.expand(in);
+ }
+
+ public Collection expand() {
+ return normalizer.expand(this.delegate.toString());
+ }
+
+ public T denormalize(String in) {
+ return normalizer.denormalize(in);
+ }
+
+ @Override
+ public String normalizeRegex(String in) {
+ return normalizer.normalizeRegex(in);
+ }
+
+ @Override
+ public boolean normalizedRegexIsLossy(String in) {
+ return normalizer.normalizedRegexIsLossy(in);
+ }
+
+ @Override
+ public void normalizeAndSetNormalizedValue(T valueToNormalize) {
+ setNormalizedValue(normalizer.normalizeDelegateType(valueToNormalize));
+ }
+
+ public void validate() {
+ if (this.delegate == null || this.normalizedValue == null)
+ throw new IllegalArgumentException(this + " does not validate: " + delegate + "," + normalizedValue);
+ }
+
+ private int delegateHashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + ((delegate == null) ? 0 : delegate.hashCode());
+ return result;
+ }
+
+ private boolean delegateEquals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ @SuppressWarnings("unchecked")
+ BaseType other = (BaseType) obj;
+ if (delegate == null) {
+ if (other.delegate != null)
+ return false;
+ } else if (!delegate.equals(other.delegate))
+ return false;
+ return true;
+ }
+
+ @Override
+ public int hashCode() {
+ if (delegate == null) {
+ // Use the concrete Type's full name to ensure that we don't get multiple
+ // instances of the same class (as Object#hashCode is based on virtual memory location)
+ return this.getClass().getName().hashCode();
+ } else {
+ return delegateHashCode();
+ }
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (delegate == null) {
+ // Per the Object#equals contract, equals(null) must return false rather than throw.
+ if (o == null) {
+ return false;
+ }
+
+ // Since Types are considered to be stateless,
+ // we can treat equality as the same class
+ return o.getClass().equals(this.getClass());
+ } else {
+ return delegateEquals(o);
+ }
+ }
+
+ @Override
+ public String getDelegateAsString() {
+ return toString();
+ }
+
+ @Override
+ public String toString() {
+ return delegate == null ? super.toString() : delegate.toString();
+ }
+
+ /**
+ * Estimates the size as: one string (normalizedValue), one unknown object (delegate), one normalizer reference and the reference to this object. Normalizers
+ * themselves are not counted because they are singletons. The normalized values of a {@link OneToManyNormalizerType} are added as well.
+ *
+ * @return the estimated size of this object in bytes
+ */
+ @Override
+ public long sizeInBytes() {
+ long size = 0;
+ if (this instanceof OneToManyNormalizerType) {
+ List values = ((OneToManyNormalizerType>) this).getNormalizedValues();
+ size += values.stream().map(String::length).map(length -> 2 * length + ObjectSizeOf.Sizer.REFERENCE).reduce(Integer::sum).orElse(0);
+ }
+ size += STATIC_SIZE + (2 * normalizedValue.length()) + ObjectSizeOf.Sizer.getObjectSize(delegate);
+ return size;
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/DateType.java b/core/utils/type-utils/src/main/java/datawave/data/type/DateType.java
new file mode 100644
index 00000000000..2659ef0a865
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/type/DateType.java
@@ -0,0 +1,36 @@
+package datawave.data.type;
+
+import java.util.Date;
+
+import datawave.data.normalizer.Normalizer;
+
+public class DateType extends BaseType {
+
+ private static final long serialVersionUID = 936566410691643144L;
+ private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF + PrecomputedSizes.DATE_STATIC_REF + Sizer.REFERENCE;
+
+ public DateType() {
+ super(Normalizer.DATE_NORMALIZER);
+ }
+
+ public DateType(String dateString) {
+ super(Normalizer.DATE_NORMALIZER);
+ super.setDelegate(normalizer.denormalize(dateString)); // parse the string with the date normalizer and store it as the delegate
+ }
+
+ @Override
+ public String getDelegateAsString() {
+ // the normalized form of the date preserves milliseconds
+ return normalizer.normalizeDelegateType(getDelegate());
+ }
+
+ /**
+ * Estimated size: one string (the normalized value), one date object and one reference to the normalizer.
+ *
+ * @return the estimated size of this object in bytes
+ */
+ @Override
+ public long sizeInBytes() {
+ return STATIC_SIZE + (2 * normalizedValue.length());
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/DiscreteIndexType.java b/core/utils/type-utils/src/main/java/datawave/data/type/DiscreteIndexType.java
new file mode 100644
index 00000000000..8352ad14dd1
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/type/DiscreteIndexType.java
@@ -0,0 +1,54 @@
+package datawave.data.type;
+
+import java.util.List;
+
+/**
+ * Contains a collection of useful methods which can be used against index entries which are discrete and calculable.
+ *
+ * @param
+ */
+public interface DiscreteIndexType> extends Type {
+
+ /**
+ * Increments the given index to the next logical value.
+ *
+ * If producesFixedLengthRanges is true, and incrementIndex would cause the length of the index to change, the original index will be returned.
+ *
+ * @param index the index value to increment
+ * @return an incremented index
+ */
+ String incrementIndex(String index);
+
+ /**
+ * Decrements the given index to the previous logical value.
+ *
+ * If producesFixedLengthRanges is true, and decrementIndex would cause the length of the index to change, the original index will be returned.
+ *
+ * @param index the index value to decrement
+ * @return a decremented index
+ */
+ String decrementIndex(String index);
+
+ /**
+ * Returns a list of all discrete values between begin and end.
+ *
+ * If producesFixedLengthRanges is true, the returned values will be of the same length as begin and end.
+ *
+ * If producesFixedLengthRanges is true, and begin and end are of different lengths, the original range will be returned.
+ *
+ * If begin does not come before end, an empty list will be returned.
+ *
+ * @param beginIndex the index value at which the range begins
+ * @param endIndex the index value at which the range ends
+ * @return a list of the discrete index values between begin and end
+ */
+ List discretizeRange(String beginIndex, String endIndex);
+
+ /**
+ * Indicates whether or not the ranges against the given indices will be of fixed length. That is to say, whether or not all index values within a given
+ * range will have the same string length. This is an important characteristic which enables composite ranges to be created.
+ *
+ * @return whether query ranges against these values will be of fixed length
+ */
+ boolean producesFixedLengthRanges();
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/GeoLatType.java b/core/utils/type-utils/src/main/java/datawave/data/type/GeoLatType.java
new file mode 100644
index 00000000000..e292c000f8c
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/type/GeoLatType.java
@@ -0,0 +1,23 @@
+package datawave.data.type;
+
+import datawave.data.normalizer.Normalizer;
+
+public class GeoLatType extends BaseType {
+
+ private static final long serialVersionUID = -2775239290833908032L;
+ private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF * 2 + Sizer.REFERENCE;
+
+ public GeoLatType() {
+ super(Normalizer.GEO_LAT_NORMALIZER);
+ }
+
+ /**
+ * Estimated size: two strings (the normalized value and the delegate) plus the normalizer reference.
+ *
+ * @return the estimated size of this object in bytes
+ */
+ @Override
+ public long sizeInBytes() {
+ return STATIC_SIZE + (2 * normalizedValue.length()) + (2 * delegate.length());
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/GeoLonType.java b/core/utils/type-utils/src/main/java/datawave/data/type/GeoLonType.java
new file mode 100644
index 00000000000..2d34ff553f1
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/type/GeoLonType.java
@@ -0,0 +1,23 @@
+package datawave.data.type;
+
+import datawave.data.normalizer.Normalizer;
+
+public class GeoLonType extends BaseType {
+
+ private static final long serialVersionUID = 8912983433360105604L;
+ private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF * 2 + Sizer.REFERENCE;
+
+ public GeoLonType() {
+ super(Normalizer.GEO_LON_NORMALIZER);
+ }
+
+ /**
+ * Estimated size: two strings (the normalized value and the delegate) plus the normalizer reference.
+ *
+ * @return the estimated size of this object in bytes
+ */
+ @Override
+ public long sizeInBytes() {
+ return STATIC_SIZE + (2 * normalizedValue.length()) + (2 * delegate.length());
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/GeoType.java b/core/utils/type-utils/src/main/java/datawave/data/type/GeoType.java
new file mode 100644
index 00000000000..85d1419603c
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/type/GeoType.java
@@ -0,0 +1,23 @@
+package datawave.data.type;
+
+import datawave.data.normalizer.Normalizer;
+
+public class GeoType extends BaseType {
+
+ private static final long serialVersionUID = 8429780512238258642L;
+ private static final long STATIC_SIZE = PrecomputedSizes.STRING_STATIC_REF * 2 + Sizer.REFERENCE;
+
+ public GeoType() {
+ super(Normalizer.GEO_NORMALIZER);
+ }
+
+ /**
+ * Estimated size: two strings (the normalized value and the delegate) plus the normalizer reference.
+ *
+ * @return the estimated size of this object in bytes
+ */
+ @Override
+ public long sizeInBytes() {
+ return STATIC_SIZE + (2 * normalizedValue.length()) + (2 * delegate.length());
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/GeometryType.java b/core/utils/type-utils/src/main/java/datawave/data/type/GeometryType.java
new file mode 100644
index 00000000000..23e6e69ee81
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/type/GeometryType.java
@@ -0,0 +1,43 @@
+package datawave.data.type;
+
+import java.util.List;
+
+import datawave.data.normalizer.Normalizer;
+import datawave.data.normalizer.OneToManyNormalizer;
+import datawave.data.type.util.Geometry;
+
+/**
+ * Provides inclusive support for all geometry types. OneToManyNormalizer support is needed as lines and polygons are likely to produce multiple normalized
+ * values during ingest.
+ */
+public class GeometryType extends AbstractGeometryType implements OneToManyNormalizerType {
+
+ protected List normalizedValues; // all normalized values produced for the delegate; set via setNormalizedValues
+
+ public GeometryType() {
+ super(Normalizer.GEOMETRY_NORMALIZER);
+ }
+
+ public List normalizeToMany(String in) {
+ return ((OneToManyNormalizer) normalizer).normalizeToMany(in);
+ }
+
+ public void setNormalizedValues(List normalizedValues) {
+ this.normalizedValues = normalizedValues;
+ setNormalizedValue(this.normalizedValues.toString()); // the single normalized value is the list's toString() form
+ }
+
+ @Override
+ public void normalizeAndSetNormalizedValue(Geometry valueToNormalize) {
+ setNormalizedValues(((OneToManyNormalizer) normalizer).normalizeDelegateTypeToMany(valueToNormalize)); // one-to-many: also refreshes normalizedValue
+ }
+
+ public List getNormalizedValues() {
+ return normalizedValues;
+ }
+
+ @Override
+ public boolean expandAtQueryTime() {
+ return false;
+ }
+}
diff --git a/core/utils/type-utils/src/main/java/datawave/data/type/HexStringType.java b/core/utils/type-utils/src/main/java/datawave/data/type/HexStringType.java
new file mode 100644
index 00000000000..6528a5ba8a6
--- /dev/null
+++ b/core/utils/type-utils/src/main/java/datawave/data/type/HexStringType.java
@@ -0,0 +1,23 @@
+package datawave.data.type;
+
+import datawave.data.normalizer.Normalizer;
+
+public class HexStringType extends BaseType