Skip to content

Commit

Permalink
updated to v208, added Country
Browse files Browse the repository at this point in the history
-updated to API v208,
-added Country classification model,
-added probability columns in output
  • Loading branch information
namsor committed Jan 30, 2020
1 parent 62ff353 commit 2c56c21
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 5 deletions.
1 change: 1 addition & 0 deletions nb-configuration.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Without this configuration present, some functionality in the IDE may be limited
<configurations>
<configuration id="run-diaspora-id" profiles=""/>
<configuration id="run-diaspora-id_clone" profiles=""/>
<configuration id="run-diaspora-id_clone" profiles="run-country-idfnln"/>
</configurations>
</config-data>
</project-shared-configuration>
49 changes: 46 additions & 3 deletions nbactions-run-diaspora-id_clone.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
<goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
</goals>
<properties>
<exec.args>-Xmx12500m -classpath %classpath com.namsor.tools.NamSorTools -apiKey ed051bf58dcda88ee57ddfe6feddde9d -r -uid -header -f fnlngeo -i D:\Projects\georgia\companiesHouse\persons-with-significant-control-snapshot-2019-05-14_idfnlngeo.txt -service gender</exec.args>
<exec.args>-Xmx12500m -classpath %classpath com.namsor.tools.NamSorTools -apiKey e77eee3fd16f82751912d27cdbacf7ec -w -header -uid -f fnln -i "D:\Sync\Dropbox\0_NamSor_SAS\NamSor_com\NamSor SAS\10_clients\HEC.CA\20200108_HEC_ca_idfnln.txt" -service country</exec.args>
<exec.executable>java</exec.executable>
</properties>
</action>
Expand All @@ -24,7 +24,7 @@
<goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
</goals>
<properties>
<exec.args>-Xdebug -Xrunjdwp:transport=dt_socket,server=n,address=${jpda.address} -Xmx12500m -classpath %classpath com.namsor.tools.NamSorTools -apiKey ed051bf58dcda88ee57ddfe6feddde9d -r -uid -header -f fnlngeo -i D:\Projects\georgia\companiesHouse\persons-with-significant-control-snapshot-2019-05-14_idfnlngeo.txt -service gender</exec.args>
<exec.args>-Xdebug -Xrunjdwp:transport=dt_socket,server=n,address=${jpda.address} -Xmx12500m -classpath %classpath com.namsor.tools.NamSorTools -apiKey e77eee3fd16f82751912d27cdbacf7ec -w -header -uid -f fnln -i "D:\Sync\Dropbox\0_NamSor_SAS\NamSor_com\NamSor SAS\10_clients\HEC.CA\20200108_HEC_ca_idfnln.txt" -service country</exec.args>
<jpda.listen>true</jpda.listen>
<exec.executable>java</exec.executable>
</properties>
Expand All @@ -39,7 +39,50 @@
<goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
</goals>
<properties>
<exec.args>-Xmx12500m -classpath %classpath com.namsor.tools.NamSorTools -apiKey ed051bf58dcda88ee57ddfe6feddde9d -r -uid -header -f fnlngeo -i D:\Projects\georgia\companiesHouse\persons-with-significant-control-snapshot-2019-05-14_idfnlngeo.txt -service gender</exec.args>
<exec.args>-Xmx12500m -classpath %classpath com.namsor.tools.NamSorTools -apiKey e77eee3fd16f82751912d27cdbacf7ec -w -header -uid -f fnln -i "D:\Sync\Dropbox\0_NamSor_SAS\NamSor_com\NamSor SAS\10_clients\HEC.CA\20200108_HEC_ca_idfnln.txt" -service country</exec.args>
<exec.executable>java</exec.executable>
</properties>
</action>
<action>
<actionName>run</actionName>
<packagings>
<packaging>jar</packaging>
</packagings>
<goals>
<goal>process-classes</goal>
<goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
</goals>
<properties>
<exec.args>-Xmx12500m -classpath %classpath com.namsor.tools.NamSorTools -apiKey e77eee3fd16f82751912d27cdbacf7ec -w -header -uid -f fnln -i "D:\Sync\Dropbox\0_NamSor_SAS\NamSor_com\NamSor SAS\10_clients\HEC.CA\20200108_HEC_ca_idfnln.txt" -service country</exec.args>
<exec.executable>java</exec.executable>
</properties>
</action>
<action>
<actionName>debug</actionName>
<packagings>
<packaging>jar</packaging>
</packagings>
<goals>
<goal>process-classes</goal>
<goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
</goals>
<properties>
<exec.args>-Xdebug -Xrunjdwp:transport=dt_socket,server=n,address=${jpda.address} -Xmx12500m -classpath %classpath com.namsor.tools.NamSorTools -apiKey e77eee3fd16f82751912d27cdbacf7ec -w -header -uid -f fnln -i "D:\Sync\Dropbox\0_NamSor_SAS\NamSor_com\NamSor SAS\10_clients\HEC.CA\20200108_HEC_ca_idfnln.txt" -service country</exec.args>
<jpda.listen>true</jpda.listen>
<exec.executable>java</exec.executable>
</properties>
</action>
<action>
<actionName>profile</actionName>
<packagings>
<packaging>jar</packaging>
</packagings>
<goals>
<goal>process-classes</goal>
<goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
</goals>
<properties>
<exec.args>-Xmx12500m -classpath %classpath com.namsor.tools.NamSorTools -apiKey e77eee3fd16f82751912d27cdbacf7ec -w -header -uid -f fnln -i "D:\Sync\Dropbox\0_NamSor_SAS\NamSor_com\NamSor SAS\10_clients\HEC.CA\20200108_HEC_ca_idfnln.txt" -service country</exec.args>
<exec.executable>java</exec.executable>
</properties>
</action>
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>namsor-sdk2</artifactId>
<version>2.0.7</version>
<version>2.0.8</version>
</dependency>
</dependencies>
<build>
Expand Down
40 changes: 39 additions & 1 deletion src/main/java/com/namsor/tools/NamSorTools.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import com.namsor.sdk2.model.BatchFirstLastNameUSRaceEthnicityOut;
import com.namsor.sdk2.model.BatchPersonalNameGenderedOut;
import com.namsor.sdk2.model.BatchPersonalNameGeoIn;
import com.namsor.sdk2.model.BatchPersonalNameGeoOut;
import com.namsor.sdk2.model.BatchPersonalNameIn;
import com.namsor.sdk2.model.BatchPersonalNameParsedOut;
import com.namsor.sdk2.model.FirstLastNameDiasporaedOut;
Expand All @@ -27,6 +28,7 @@
import com.namsor.sdk2.model.FirstLastNameUSRaceEthnicityOut;
import com.namsor.sdk2.model.PersonalNameGenderedOut;
import com.namsor.sdk2.model.PersonalNameGeoIn;
import com.namsor.sdk2.model.PersonalNameGeoOut;
import com.namsor.sdk2.model.PersonalNameIn;
import com.namsor.sdk2.model.PersonalNameParsedOut;
import java.io.BufferedReader;
Expand Down Expand Up @@ -97,26 +99,30 @@ public class NamSorTools {
// Names of the supported services; the value given for the "service"
// command-line option is compared against these with equals().
private static final String SERVICE_NAME_PARSE = "parse";
private static final String SERVICE_NAME_GENDER = "gender";
private static final String SERVICE_NAME_ORIGIN = "origin";
private static final String SERVICE_NAME_COUNTRY = "country";
private static final String SERVICE_NAME_DIASPORA = "diaspora";
private static final String SERVICE_NAME_USRACEETHNICITY = "usraceethnicity";

// All supported service names.
// NOTE(review): the order here is the same as in OUTPUT_DATA_HEADERS, so the
// two arrays are presumably matched by index -- confirm against the lookup
// code before reordering or inserting entries.
private static final String[] SERVICES = {
SERVICE_NAME_PARSE,
SERVICE_NAME_GENDER,
SERVICE_NAME_ORIGIN,
SERVICE_NAME_COUNTRY,
SERVICE_NAME_DIASPORA,
SERVICE_NAME_USRACEETHNICITY
};

// Output column names, one array per service. The column order has to agree
// with the order in which the values are appended for that service's result
// type (e.g. country: country, countryAlt, probabilityCalibrated,
// probabilityCalibratedAlt, countryScore, script).
private static final String[] OUTPUT_DATA_PARSE_HEADER = {"firstNameParsed", "lastNameParsed", "nameParserType", "nameParserTypeAlt", "nameParserTypeScore", "script"};
private static final String[] OUTPUT_DATA_GENDER_HEADER = {"likelyGender", "likelyGenderScore", "probabilityCalibrated", "genderScale", "script"};
private static final String[] OUTPUT_DATA_ORIGIN_HEADER = {"countryOrigin", "countryOriginAlt", "probabilityCalibrated", "probabilityCalibratedAlt", "countryOriginScore", "script"};
private static final String[] OUTPUT_DATA_COUNTRY_HEADER = {"country", "countryAlt", "probabilityCalibrated", "probabilityCalibratedAlt", "countryScore", "script"};
private static final String[] OUTPUT_DATA_DIASPORA_HEADER = {"ethnicity", "ethnicityAlt", "ethnicityScore", "script"};
private static final String[] OUTPUT_DATA_USRACEETHNICITY_HEADER = {"raceEthnicity", "raceEthnicityAlt", "probabilityCalibrated", "probabilityCalibratedAlt", "raceEthnicityScore", "script"};
// Header arrays for every service.
// NOTE(review): listed in the same order as SERVICES; presumably selected by
// the index of the requested service in that array -- verify before changing.
private static final String[][] OUTPUT_DATA_HEADERS = {
OUTPUT_DATA_PARSE_HEADER,
OUTPUT_DATA_GENDER_HEADER,
OUTPUT_DATA_ORIGIN_HEADER,
OUTPUT_DATA_COUNTRY_HEADER,
OUTPUT_DATA_DIASPORA_HEADER,
OUTPUT_DATA_USRACEETHNICITY_HEADER
};
Expand Down Expand Up @@ -278,7 +284,7 @@ public static void main(String[] args) {

Option service = Option.builder("service").argName("service")
.hasArg(true)
.desc("service : parse / gender / origin / diaspora / usraceethnicity")
.desc("service : parse / gender / origin / country / diaspora / usraceethnicity")
.longOpt("endpoint")
.required(true)
.build();
Expand Down Expand Up @@ -610,6 +616,28 @@ private Map<String, PersonalNameGenderedOut> processGenderFull(List<PersonalName
return result;
}

/**
 * Runs the country service on a batch of full (unsplit) personal names.
 *
 * <p>All names are sent in a single {@code countryBatch} request and the
 * returned results are indexed by their id. If two results carry the same
 * id, the later one replaces the earlier one.
 *
 * @param names the personal names to submit
 * @return a map from each returned result's id to that result
 * @throws ApiException if the batch API call fails
 * @throws IOException declared for callers; see the method signature
 */
private Map<String, PersonalNameGeoOut> processCountry(List<PersonalNameIn> names) throws ApiException, IOException {
    // Parameterized rather than a raw HashMap so the compiler checks key and value types.
    Map<String, PersonalNameGeoOut> result = new HashMap<String, PersonalNameGeoOut>();
    BatchPersonalNameIn body = new BatchPersonalNameIn();
    body.setPersonalNames(names);
    BatchPersonalNameGeoOut countried = api.countryBatch(body);
    for (PersonalNameGeoOut personalName : countried.getPersonalNames()) {
        result.put(personalName.getId(), personalName);
    }
    return result;
}

/**
 * Runs the country service on names supplied as separate first and last names.
 *
 * <p>Each input is converted to a {@code PersonalNameIn} with the same id and
 * a full name built as first name, one space, last name; the converted list
 * is then passed to {@code processCountry}. A first or last name that is
 * {@code null} is treated as empty, so the literal text "null" is never sent
 * as part of a name.
 *
 * @param names_ the first/last name pairs to submit
 * @return a map from each returned result's id to that result
 * @throws ApiException if the batch API call fails
 * @throws IOException declared for callers; see the method signature
 */
private Map<String, PersonalNameGeoOut> processCountryAdapted(List<FirstLastNameIn> names_) throws ApiException, IOException {
    List<PersonalNameIn> names = new ArrayList<PersonalNameIn>(names_.size());
    for (FirstLastNameIn name : names_) {
        String firstName = name.getFirstName();
        String lastName = name.getLastName();
        // String concatenation renders a null reference as "null"; substitute "" instead.
        if (firstName == null) {
            firstName = "";
        }
        if (lastName == null) {
            lastName = "";
        }
        PersonalNameIn adapted = new PersonalNameIn();
        adapted.setId(name.getId());
        adapted.setName(firstName + " " + lastName);
        names.add(adapted);
    }
    return processCountry(names);
}

private Map<String, PersonalNameGenderedOut> processGenderFullGeo(List<PersonalNameGeoIn> names) throws ApiException, IOException {
Map<String, PersonalNameGenderedOut> result = new HashMap();
BatchPersonalNameGeoIn body = new BatchPersonalNameGeoIn();
Expand Down Expand Up @@ -673,6 +701,9 @@ private void processData(String service, String[] outputHeaders, Writer writer,
} else if (service.equals(SERVICE_NAME_GENDER)) {
Map<String, FirstLastNameGenderedOut> genders = processGender(new ArrayList(firstLastNamesIn.values()));
append(writer, outputHeaders, firstLastNamesIn, genders, softwareNameAndVersion);
} else if (service.equals(SERVICE_NAME_COUNTRY)) {
Map<String, PersonalNameGeoOut> countrieds = processCountryAdapted(new ArrayList(firstLastNamesIn.values()));
append(writer, outputHeaders, firstLastNamesIn, countrieds, softwareNameAndVersion);
}
firstLastNamesIn.clear();
}
Expand All @@ -699,6 +730,9 @@ private void processData(String service, String[] outputHeaders, Writer writer,
} else if (service.equals(SERVICE_NAME_GENDER)) {
Map<String, FirstLastNameGenderedOut> genders = processGenderFull(new ArrayList(personalNamesIn.values()));
append(writer, outputHeaders, personalNamesIn, genders, softwareNameAndVersion);
} else if (service.equals(SERVICE_NAME_COUNTRY)) {
Map<String, PersonalNameGeoOut> countrieds = processCountry(new ArrayList(personalNamesIn.values()));
append(writer, outputHeaders, personalNamesIn, countrieds, softwareNameAndVersion);
}
personalNamesIn.clear();
}
Expand Down Expand Up @@ -761,6 +795,10 @@ private void append(Writer writer, String[] outputHeaders, Map input, Map output
PersonalNameGenderedOut personalNameGenderedOut = (PersonalNameGenderedOut) outputObj;
String scriptName = NamSorTools.computeScriptFirst(personalNameGenderedOut.getName());
writer.append(personalNameGenderedOut.getLikelyGender().getValue() + separatorOut + personalNameGenderedOut.getScore() + separatorOut + personalNameGenderedOut.getGenderScale() + separatorOut + scriptName + separatorOut);
} else if (outputObj instanceof PersonalNameGeoOut) {
PersonalNameGeoOut personalNameGeoOut = (PersonalNameGeoOut) outputObj;
String scriptName = NamSorTools.computeScriptFirst(personalNameGeoOut.getName());
writer.append(personalNameGeoOut.getCountry() + separatorOut + personalNameGeoOut.getCountryAlt() + separatorOut + personalNameGeoOut.getProbabilityCalibrated() + separatorOut + personalNameGeoOut.getProbabilityAltCalibrated() + separatorOut +personalNameGeoOut.getScore() + separatorOut + scriptName + separatorOut);
} else if (outputObj instanceof PersonalNameParsedOut) {
PersonalNameParsedOut personalNameParsedOut = (PersonalNameParsedOut) outputObj;
// {"firstNameParsed", "lastNameParsed", "nameParserType", "nameParserTypeAlt", "nameParserTypeScore"};
Expand Down

0 comments on commit 2c56c21

Please sign in to comment.