|
This is based on an old project that I resurrected, improved, and repackaged with Claude Code. It's useful for estimating demographics in large datasets from limited information, i.e., just a name. It's also fairly good at separating given names from family names across a wide variety of languages and contexts. It contains a standalone binary, an embeddable shared library, and a Python wrapper. Examples — CLI: user@box » ./build/name-classifier -j -c "Carlos Eduardo Fernando Salazar Montemayor" | jq . {
"input": "Carlos Eduardo Fernando Salazar Montemayor",
"script": "latin",
"components": [
{
"token": "Carlos",
"role": "given",
"index": 0,
"surname_score": 0.009
},
{
"token": "Eduardo",
"role": "given",
"index": 1,
"surname_score": 0.001
},
{
"token": "Fernando",
"role": "given",
"index": 2,
"surname_score": 0.01
},
{
"token": "Salazar",
"role": "family",
"index": 3,
"surname_score": 0.998
},
{
"token": "Montemayor",
"role": "family",
"index": 4,
"surname_score": 0.975
}
],
"attributes": {
"gender": {
"male": 0.9938,
"female": 0.0062,
"neutral": 0
},
"origin": {
"english": 0,
"french": 0,
"germanic": 0,
"nordic": 0,
"iberian": 1,
"italian": 0,
"eastern_european": 0,
"arabic": 0,
"east_asian": 0,
"south_asian": 0,
"southeast_asian": 0
}
},
"calibrated": true,
"model_version": "embedded",
"provenance": {
"gender": {
"lexicon": 0.598,
"ngram": 0.302,
"neural": 0.101
},
"origin": {
"lexicon": 0,
"ngram": 0,
"neural": 0
}
}
} Python: from name_classifier import NameClassifier; nc = NameClassifier(args.model_dir); nc.classify("Kateryna Olha Mykhailivna Shevchenko") |