Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add more descriptive types to the data dictionary #31

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions api/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
<datawave.webservice.namespace>http://webservice.datawave.nsa/v1</datawave.webservice.namespace>
<version.datawave.base-rest-responses>4.0.0</version.datawave.base-rest-responses>
<version.datawave.metadata-utils>4.0.5</version.datawave.metadata-utils>
<version.datawave.type-utils>3.0.4</version.datawave.type-utils>
<version.guava>31.1-jre</version.guava>
</properties>
<dependencyManagement>
Expand Down Expand Up @@ -59,6 +60,11 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>gov.nsa.datawave.microservice</groupId>
<artifactId>type-utils</artifactId>
<version>${version.datawave.type-utils}</version>
</dependency>
</dependencies>
</dependencyManagement>
<dependencies>
Expand All @@ -74,6 +80,10 @@
<groupId>gov.nsa.datawave.microservice</groupId>
<artifactId>metadata-utils</artifactId>
</dependency>
<dependency>
<groupId>gov.nsa.datawave.microservice</groupId>
<artifactId>type-utils</artifactId>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ public DataDictionaryController(DataDictionaryProperties dataDictionaryConfigura
this.dataDictionary = dataDictionary;
this.responseObjectFactory = responseObjectFactory;
this.accumuloConnectionService = accumloConnectionService;
dataDictionary.setNormalizationMap(dataDictionaryConfiguration.getNormalizerMap());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@

public interface DataDictionary<META extends MetadataFieldBase<META,DESC>,DESC extends DescriptionBase<DESC>,FIELD extends DictionaryFieldBase<FIELD,DESC>> {

Map<String,String> getNormalizationMap();

void setNormalizationMap(Map<String,String> normalizationMap);

Collection<META> getFields(Connection connectionConfig, Collection<String> dataTypeFilters, int numThreads) throws Exception;

void setDescription(Connection connectionConfig, FIELD description) throws Exception;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ public class DataDictionaryImpl implements DataDictionary<DefaultMetadataField,D
private final ResponseObjectFactory<DefaultDescription,DefaultDataDictionary,DefaultMetadataField,DefaultDictionaryField,DefaultFields> responseObjectFactory;
private final MetadataHelperFactory metadataHelperFactory;
private final MetadataDescriptionsHelperFactory<DefaultDescription> metadataDescriptionsHelperFactory;
private Map<String,String> normalizationMap = Maps.newHashMap();

public DataDictionaryImpl(MarkingFunctions markingFunctions,
ResponseObjectFactory<DefaultDescription,DefaultDataDictionary,DefaultMetadataField,DefaultDictionaryField,DefaultFields> responseObjectFactory,
Expand All @@ -47,16 +46,6 @@ public DataDictionaryImpl(MarkingFunctions markingFunctions,
this.metadataDescriptionsHelperFactory = metadataDescriptionsHelperFactory;
}

// Returns the map held in the normalizationMap field itself (not a copy), so changes made by the caller
// through the returned reference are visible to this instance.
@Override
public Map<String,String> getNormalizationMap() {
return normalizationMap;
}

// Replaces the normalizationMap field with the given reference. No copy is made and null is not rejected.
@Override
public void setNormalizationMap(Map<String,String> normalizationMap) {
this.normalizationMap = normalizationMap;
}

/**
* Retrieve metadata fields from the specified metadata table, aggregated by field name and data type.
*
Expand All @@ -76,8 +65,7 @@ public void setNormalizationMap(Map<String,String> normalizationMap) {
@Override
public Collection<DefaultMetadataField> getFields(Connection connectionConfig, Collection<String> dataTypeFilters, int numThreads) throws Exception {
// Resolve the field aliases first; they are handed to the scanner together with the data type filters.
Map<String,String> aliases = getAliases(connectionConfig);
// NOTE(review): 'scanner' is declared twice below, once with and once without normalizationMap. This looks like
// the removed and the added line of a diff shown together -- confirm which constructor form is current.
DefaultMetadataFieldScanner scanner = new DefaultMetadataFieldScanner(markingFunctions, responseObjectFactory, normalizationMap, connectionConfig,
numThreads);
DefaultMetadataFieldScanner scanner = new DefaultMetadataFieldScanner(markingFunctions, responseObjectFactory, connectionConfig, numThreads);
// A new scanner is created on every call and performs the actual field retrieval.
return scanner.getFields(aliases, dataTypeFilters);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package datawave.microservice.metadata;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.text.SimpleDateFormat;
import java.time.Instant;
import java.time.LocalDate;
Expand All @@ -23,13 +24,15 @@
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.iterators.user.WholeRowIterator;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Maps;

import datawave.data.ColumnFamilyConstants;
import datawave.data.type.util.TypePrettyNameSupplier;
import datawave.marking.MarkingFunctions;
import datawave.microservice.Connection;
import datawave.microservice.dictionary.config.ResponseObjectFactory;
Expand All @@ -47,16 +50,14 @@ public class DefaultMetadataFieldScanner {

private final MarkingFunctions markingFunctions;
private final ResponseObjectFactory<DefaultDescription,?,DefaultMetadataField,?,?> responseObjectFactory;
private final Map<String,String> normalizationMap;
private final Connection connectionConfig;
private final int numThreads;

// Creates a scanner for the given connection configuration. The constructor only stores its arguments in the
// corresponding fields; nothing else is done here.
// NOTE(review): the parameter list and the assignments below show both a form with normalizationMap and a form
// without it. This looks like removed and added diff lines shown together -- confirm which signature is current.
public DefaultMetadataFieldScanner(MarkingFunctions markingFunctions,
ResponseObjectFactory<DefaultDescription,DefaultDataDictionary,DefaultMetadataField,DefaultDictionaryField,DefaultFields> responseObjectFactory,
Map<String,String> normalizationMap, Connection connectionConfig, int numThreads) {
Connection connectionConfig, int numThreads) {
this.markingFunctions = markingFunctions;
this.responseObjectFactory = responseObjectFactory;
this.normalizationMap = normalizationMap;
this.connectionConfig = connectionConfig;
this.numThreads = numThreads;
}
Expand Down Expand Up @@ -266,13 +267,38 @@ private void setDescriptions() throws MarkingFunctions.Exception {
currField.getDescriptions().add(description);
}

// Set the normalized type for the current {@link DefaultMetadataField}. If no normalized version can be found for the type, the type will default to
// "Unknown".
// Derives a display name for a type name that could not be resolved to a usable class.
// Takes the last dot-separated segment of the given name, removes every occurrence of the redundant word "Type"
// from it, and capitalizes the first letter (e.g. "datawave.data.type.catType" becomes "Cat").
// A name consisting solely of dots (e.g. ".") has no segments at all; an empty string is returned for it, the same
// result an empty name produces, instead of failing with an ArrayIndexOutOfBoundsException.
private String determineUnknownType(String unknown) {
    String[] segments = unknown.split("\\.");
    if (segments.length == 0) {
        // String.split drops trailing empty strings, so input such as "." yields an empty array.
        return "";
    }
    String simpleName = segments[segments.length - 1];
    return StringUtils.capitalize(simpleName.replace("Type", ""));
}

// Adds a type entry to the current {@link DefaultMetadataField}, derived from the current column qualifier.
// The type name is the text following the first null character of the qualifier; if the qualifier contains no
// null character, indexOf returns -1 and the whole qualifier is used as the type name.
private void setType() {
int nullPos = currColumnQualifier.indexOf('\0');
String type = currColumnQualifier.substring(nullPos + 1);
// NOTE(review): the next two lines add a type from normalizationMap (or "Unknown"), and the try/catch further down
// adds a second one. This looks like removed and added diff lines shown together -- confirm only one path remains.
String normalizedType = normalizationMap.get(type);
currField.addType(normalizedType != null ? normalizedType : "Unknown");
/*
* Attempt to get a new instance of the class within 'type'. This will be used to determine what value(s) should be placed into the 'Types' field in
* the data dictionary.
*
* Use the value returned from getDataDictionaryTypeValue when: The class can be found AND it is an instance of TypePrettyNameSupplier AND
* getDataDictionaryTypeValue is not null.
*
* Use the DEFAULT_DATA_DICTIONARY_NAME provided in TypePrettyNameSupplier when: The class is found but getDataDictionaryTypeValue is null OR the
* class is found but is not an instance of TypePrettyNameSupplier.
*
* Use the value from determineUnknownType when: An exception occurs
*/
try {
// NOTE(review): the class name comes straight from the column qualifier, and the named class is loaded and its
// no-argument constructor is run -- confirm the metadata table contents are trusted.
Object typeObject = Class.forName(type).getDeclaredConstructor().newInstance();
// getDataDictionaryTypeValue() is called twice here: once for the null check and once for the value added.
currField.addType(typeObject instanceof TypePrettyNameSupplier && ((TypePrettyNameSupplier) typeObject).getDataDictionaryTypeValue() != null
? ((TypePrettyNameSupplier) typeObject).getDataDictionaryTypeValue()
: TypePrettyNameSupplier.DEFAULT_DATA_DICTIONARY_NAME);
} catch (RuntimeException | ClassNotFoundException | NoSuchMethodException | InvocationTargetException | InstantiationException
| IllegalAccessException e) {
// Any failure to load or instantiate the class falls back to a name derived from the type string itself.
// The exception is not logged or rethrown.
currField.addType(determineUnknownType(type));
}
}

// Set the last updated date for the current {@link DefaultMetadataField} based on the timestamp of the current entry.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
Expand Down Expand Up @@ -77,16 +78,12 @@ public void setUp() throws Exception {
connector.tableOperations().create(MODEL_TABLE);
populateMetadataTable();

Map<String,String> normalizerMapping = new HashMap<>();
normalizerMapping.put("datawave.data.type.LcNoDiacriticsType", "Text");
normalizerMapping.put("datawave.data.type.NumberType", "Number");

Connection connectionConfig = new Connection();
connectionConfig.setAccumuloClient(connector);
connectionConfig.setMetadataTable(METADATA_TABLE);
connectionConfig.setAuths(AUTHS);

scanner = new DefaultMetadataFieldScanner(new MarkingFunctions.Default(), RESPONSE_OBJECT_FACTORY, normalizerMapping, connectionConfig, 1);
scanner = new DefaultMetadataFieldScanner(new MarkingFunctions.Default(), RESPONSE_OBJECT_FACTORY, connectionConfig, 1);
}

@Test
Expand All @@ -109,16 +106,24 @@ public void whenRetrievingFields_givenNoDataTypeFilters_shouldReturnUnfilteredRe
contributorId.setDescription(Collections.singleton(createDescription("ContributorId Description")));
contributorId.setLastUpdated(DATE);

DefaultMetadataField ipAddress = new DefaultMetadataField();
ipAddress.setFieldName("IP_ADDRESS");
ipAddress.setDataType("csv");
ipAddress.setForwardIndexed(true);
ipAddress.setTypes(Collections.singletonList("IP Address"));
ipAddress.setDescription(Collections.singleton(createDescription("IpAddress Description")));
ipAddress.setLastUpdated(DATE);

DefaultMetadataField name = new DefaultMetadataField();
name.setFieldName("NAME");
name.setDataType("tvmaze");
name.setForwardIndexed(true);
name.setReverseIndexed(true);
name.setTypes(Collections.singletonList("Unknown"));
name.setTypes(Collections.singletonList("Cat"));
name.setLastUpdated(DATE);

Collection<DefaultMetadataField> fields = scanner.getFields(Collections.emptyMap(), Collections.emptySet());
assertThat(fields).containsExactlyInAnyOrder(barField, contributorId, name);
assertThat(fields).containsExactlyInAnyOrder(barField, contributorId, ipAddress, name);
}

@Test
Expand All @@ -141,11 +146,19 @@ public void whenRetrievingFields_givenDataTypeFilters_shouldReturnFilteredResult
contributorId.setDescription(Collections.singleton(createDescription("ContributorId Description")));
contributorId.setLastUpdated(DATE);

DefaultMetadataField ipAddress = new DefaultMetadataField();
ipAddress.setFieldName("IP_ADDRESS");
ipAddress.setDataType("csv");
ipAddress.setForwardIndexed(true);
ipAddress.setTypes(Collections.singletonList("IP Address"));
ipAddress.setDescription(Collections.singleton(createDescription("IpAddress Description")));
ipAddress.setLastUpdated(DATE);

Set<String> dataTypeFilters = new HashSet<>();
dataTypeFilters.add("csv");
dataTypeFilters.add("enwiki");
Collection<DefaultMetadataField> fields = scanner.getFields(Collections.emptyMap(), dataTypeFilters);
assertThat(fields).containsExactlyInAnyOrder(barField, contributorId);
assertThat(fields).containsExactlyInAnyOrder(barField, contributorId, ipAddress);
}

@Test
Expand All @@ -170,19 +183,29 @@ public void whenRetrievingFields_givenAliases_shouldReturnResultsWithAliases() t
contributorId.setDescription(Collections.singleton(createDescription("ContributorId Description")));
contributorId.setLastUpdated(DATE);

DefaultMetadataField ipAddress = new DefaultMetadataField();
ipAddress.setFieldName("ip_address");
ipAddress.setInternalFieldName("IP_ADDRESS");
ipAddress.setDataType("csv");
ipAddress.setForwardIndexed(true);
ipAddress.setTypes(Collections.singletonList("IP Address"));
ipAddress.setDescription(Collections.singleton(createDescription("IpAddress Description")));
ipAddress.setLastUpdated(DATE);

DefaultMetadataField name = new DefaultMetadataField();
name.setFieldName("NAME");
name.setDataType("tvmaze");
name.setForwardIndexed(true);
name.setReverseIndexed(true);
name.setTypes(Collections.singletonList("Unknown"));
name.setTypes(Collections.singletonList("Cat"));
name.setLastUpdated(DATE);

Map<String,String> aliases = new HashMap<>();
aliases.put("BAR_FIELD", "bar_field_alias");
aliases.put("CONTRIBUTOR_ID", "contributor_id_alias");
aliases.put("IP_ADDRESS", "ip_address");
Collection<DefaultMetadataField> fields = scanner.getFields(aliases, Collections.emptySet());
assertThat(fields).containsExactlyInAnyOrder(barField, contributorId, name);
assertThat(fields).containsExactlyInAnyOrder(barField, contributorId, ipAddress, name);
}

private void populateMetadataTable() throws TableNotFoundException, MutationsRejectedException {
Expand All @@ -201,16 +224,24 @@ private void populateMetadataTable() throws TableNotFoundException, MutationsRej
contributorId.put(new Text(ColumnFamilyConstants.COLF_DESC), new Text("enwiki"), new ColumnVisibility("PRIVATE"), TIMESTAMP,
new Value("ContributorId Description"));

Mutation ipAddress = new Mutation(new Text("IP_ADDRESS"));
ipAddress.put(new Text(ColumnFamilyConstants.COLF_E), new Text("csv"), TIMESTAMP, new Value());
ipAddress.put(new Text(ColumnFamilyConstants.COLF_I), new Text("csv"), TIMESTAMP, new Value());
ipAddress.put(new Text(ColumnFamilyConstants.COLF_T), new Text("csv\0datawave.data.type.IpAddressType"), TIMESTAMP, new Value());
ipAddress.put(new Text(ColumnFamilyConstants.COLF_DESC), new Text("csv"), new ColumnVisibility("PRIVATE"), TIMESTAMP,
new Value("IpAddress Description"));

Mutation name = new Mutation(new Text("NAME"));
name.put(new Text(ColumnFamilyConstants.COLF_E), new Text("tvmaze"), TIMESTAMP, new Value());
name.put(new Text(ColumnFamilyConstants.COLF_I), new Text("tvmaze"), TIMESTAMP, new Value());
name.put(new Text(ColumnFamilyConstants.COLF_RI), new Text("tvmaze"), TIMESTAMP, new Value());
name.put(new Text(ColumnFamilyConstants.COLF_T), new Text("tvmaze\0not.a.known.type"), TIMESTAMP, new Value());
name.put(new Text(ColumnFamilyConstants.COLF_T), new Text("tvmaze\0datawave.data.type.catType"), TIMESTAMP, new Value());

BatchWriterConfig bwConfig = new BatchWriterConfig().setMaxMemory(10L).setMaxLatency(1, TimeUnit.SECONDS).setMaxWriteThreads(1);
BatchWriter writer = connector.createBatchWriter(METADATA_TABLE, bwConfig);
writer.addMutation(barField);
writer.addMutation(contributorId);
writer.addMutation(ipAddress);
writer.addMutation(name);
writer.flush();
writer.close();
Expand Down
Loading