Skip to content

Commit

Permalink
Merge pull request #324 from maxmind/horgh/email-normalize
Browse files Browse the repository at this point in the history
Add additional email normalization
  • Loading branch information
ugexe authored Mar 7, 2024
2 parents 408ecea + 804e9c1 commit dc563a2
Show file tree
Hide file tree
Showing 6 changed files with 311 additions and 32 deletions.
25 changes: 25 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,31 @@ CHANGELOG
------------------

* Added `PXP_FINANCIAL` and `TRUSTPAY` to the `Payment.Processor` enum.
* Equivalent domain names are now normalized when `hashAddress` is used.
For example, `googlemail.com` will become `gmail.com`.
* Periods are now removed from `gmail.com` email address local parts when
`hashAddress` is used. For example, `f.o.o@gmail.com` will become
`foo@gmail.com`.
* Fastmail alias subdomain email addresses are now normalized when
`hashAddress` is used. For example, `foo@bar.fastmail.com` will become
`bar@fastmail.com`.
* Additional `yahoo.com` email addresses now have aliases removed from
their local part when `hashAddress` is used. For example,
`foo-bar@yahoo.com` will become `foo@yahoo.com` for additional
`yahoo.com` domains.
* Duplicate `.com`s are now removed from email domain names when
`hashAddress` is used. For example, `example.com.com` will become
`example.com`.
* Extraneous characters after `.com` are now removed from email domain
names when `hashAddress` is used. For example, `example.comfoo` will
become `example.com`.
* Certain `.com` typos are now normalized to `.com` when `hashAddress` is
used. For example, `example.cam` will become `example.com`.
* Additional `gmail.com` domain names with leading digits are now
normalized when `hashAddress` is used. For example, `100gmail.com` will
become `gmail.com`.
* Additional `gmail.com` typos are now normalized when `hashAddress` is
used. For example, `gmali.com` will become `gmail.com`.

3.3.0 (2023-12-05)
------------------
Expand Down
9 changes: 0 additions & 9 deletions checkstyle-suppressions.xml

This file was deleted.

6 changes: 0 additions & 6 deletions checkstyle.xml
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,6 @@
<module name="BeforeExecutionExclusionFileFilter">
<property name="fileNamePattern" value="module\-info\.java$"/>
</module>
<!-- https://checkstyle.org/config_filters.html#SuppressionFilter -->
<module name="SuppressionFilter">
<property name="file" value="${org.checkstyle.google.suppressionfilter.config}"
default="checkstyle-suppressions.xml"/>
<property name="optional" value="true"/>
</module>

<!-- Checks for whitespace -->
<!-- See http://checkstyle.org/config_whitespace.html -->
Expand Down
1 change: 0 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@
<configuration>
<consoleOutput>true</consoleOutput>
<configLocation>checkstyle.xml</configLocation>
<suppressionsLocation>checkstyle-suppressions.xml</suppressionsLocation>
<violationSeverity>warning</violationSeverity>
</configuration>
<dependencies>
Expand Down
253 changes: 238 additions & 15 deletions src/main/java/com/maxmind/minfraud/request/Email.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
Expand All @@ -21,22 +22,217 @@ public final class Email extends AbstractModel {
private final boolean hashAddress;
private final String domain;
private static final Map<String, String> typoDomains;
private static final Map<String, String> equivalentDomains;
private static final Map<String, Boolean> fastmailDomains;
private static final Map<String, Boolean> yahooDomains;

static {
HashMap<String, String> m = new HashMap<>() {{
// gmail.com
put("35gmai.com", "gmail.com");
put("636gmail.com", "gmail.com");
put("gamil.com", "gmail.com");
put("gmail.comu", "gmail.com");
put("gmial.com", "gmail.com");
put("gmil.com", "gmail.com");
put("yahoogmail.com", "gmail.com");
// outlook.com
put("putlook.com", "outlook.com");
}};

typoDomains = Collections.unmodifiableMap(m);
HashMap<String, String> typoDomainsMap = new HashMap<>() {{
// gmail.com
put("gmai.com", "gmail.com");
put("gamil.com", "gmail.com");
put("gmali.com", "gmail.com");
put("gmial.com", "gmail.com");
put("gmil.com", "gmail.com");
put("gmaill.com", "gmail.com");
put("gmailm.com", "gmail.com");
put("gmailo.com", "gmail.com");
put("gmailyhoo.com", "gmail.com");
put("yahoogmail.com", "gmail.com");
// outlook.com
put("putlook.com", "outlook.com");
}};
typoDomains = Collections.unmodifiableMap(typoDomainsMap);

HashMap<String, String> equivalentDomainsMap = new HashMap<>() {{
put("googlemail.com", "gmail.com");
put("pm.me", "protonmail.com");
put("proton.me", "protonmail.com");
put("yandex.by", "yandex.ru");
put("yandex.com", "yandex.ru");
put("yandex.kz", "yandex.ru");
put("yandex.ua", "yandex.ru");
put("ya.ru", "yandex.ru");
}};
equivalentDomains = Collections.unmodifiableMap(equivalentDomainsMap);

HashMap<String, Boolean> fastmailDomainsMap = new HashMap<>() {{
put("123mail.org", true);
put("150mail.com", true);
put("150ml.com", true);
put("16mail.com", true);
put("2-mail.com", true);
put("4email.net", true);
put("50mail.com", true);
put("airpost.net", true);
put("allmail.net", true);
put("bestmail.us", true);
put("cluemail.com", true);
put("elitemail.org", true);
put("emailcorner.net", true);
put("emailengine.net", true);
put("emailengine.org", true);
put("emailgroups.net", true);
put("emailplus.org", true);
put("emailuser.net", true);
put("eml.cc", true);
put("f-m.fm", true);
put("fast-email.com", true);
put("fast-mail.org", true);
put("fastem.com", true);
put("fastemail.us", true);
put("fastemailer.com", true);
put("fastest.cc", true);
put("fastimap.com", true);
put("fastmail.cn", true);
put("fastmail.co.uk", true);
put("fastmail.com", true);
put("fastmail.com.au", true);
put("fastmail.de", true);
put("fastmail.es", true);
put("fastmail.fm", true);
put("fastmail.fr", true);
put("fastmail.im", true);
put("fastmail.in", true);
put("fastmail.jp", true);
put("fastmail.mx", true);
put("fastmail.net", true);
put("fastmail.nl", true);
put("fastmail.org", true);
put("fastmail.se", true);
put("fastmail.to", true);
put("fastmail.tw", true);
put("fastmail.uk", true);
put("fastmail.us", true);
put("fastmailbox.net", true);
put("fastmessaging.com", true);
put("fea.st", true);
put("fmail.co.uk", true);
put("fmailbox.com", true);
put("fmgirl.com", true);
put("fmguy.com", true);
put("ftml.net", true);
put("h-mail.us", true);
put("hailmail.net", true);
put("imap-mail.com", true);
put("imap.cc", true);
put("imapmail.org", true);
put("inoutbox.com", true);
put("internet-e-mail.com", true);
put("internet-mail.org", true);
put("internetemails.net", true);
put("internetmailing.net", true);
put("jetemail.net", true);
put("justemail.net", true);
put("letterboxes.org", true);
put("mail-central.com", true);
put("mail-page.com", true);
put("mailandftp.com", true);
put("mailas.com", true);
put("mailbolt.com", true);
put("mailc.net", true);
put("mailcan.com", true);
put("mailforce.net", true);
put("mailftp.com", true);
put("mailhaven.com", true);
put("mailingaddress.org", true);
put("mailite.com", true);
put("mailmight.com", true);
put("mailnew.com", true);
put("mailsent.net", true);
put("mailservice.ms", true);
put("mailup.net", true);
put("mailworks.org", true);
put("ml1.net", true);
put("mm.st", true);
put("myfastmail.com", true);
put("mymacmail.com", true);
put("nospammail.net", true);
put("ownmail.net", true);
put("petml.com", true);
put("postinbox.com", true);
put("postpro.net", true);
put("proinbox.com", true);
put("promessage.com", true);
put("realemail.net", true);
put("reallyfast.biz", true);
put("reallyfast.info", true);
put("rushpost.com", true);
put("sent.as", true);
put("sent.at", true);
put("sent.com", true);
put("speedpost.net", true);
put("speedymail.org", true);
put("ssl-mail.com", true);
put("swift-mail.com", true);
put("the-fastest.net", true);
put("the-quickest.com", true);
put("theinternetemail.com", true);
put("veryfast.biz", true);
put("veryspeedy.net", true);
put("warpmail.net", true);
put("xsmail.com", true);
put("yepmail.net", true);
put("your-mail.com", true);
}};
fastmailDomains = Collections.unmodifiableMap(fastmailDomainsMap);

HashMap<String, Boolean> yahooDomainsMap = new HashMap<>() {{
put("y7mail.com", true);
put("yahoo.at", true);
put("yahoo.be", true);
put("yahoo.bg", true);
put("yahoo.ca", true);
put("yahoo.cl", true);
put("yahoo.co.id", true);
put("yahoo.co.il", true);
put("yahoo.co.in", true);
put("yahoo.co.kr", true);
put("yahoo.co.nz", true);
put("yahoo.co.th", true);
put("yahoo.co.uk", true);
put("yahoo.co.za", true);
put("yahoo.com", true);
put("yahoo.com.ar", true);
put("yahoo.com.au", true);
put("yahoo.com.br", true);
put("yahoo.com.co", true);
put("yahoo.com.hk", true);
put("yahoo.com.hr", true);
put("yahoo.com.mx", true);
put("yahoo.com.my", true);
put("yahoo.com.pe", true);
put("yahoo.com.ph", true);
put("yahoo.com.sg", true);
put("yahoo.com.tr", true);
put("yahoo.com.tw", true);
put("yahoo.com.ua", true);
put("yahoo.com.ve", true);
put("yahoo.com.vn", true);
put("yahoo.cz", true);
put("yahoo.de", true);
put("yahoo.dk", true);
put("yahoo.ee", true);
put("yahoo.es", true);
put("yahoo.fi", true);
put("yahoo.fr", true);
put("yahoo.gr", true);
put("yahoo.hu", true);
put("yahoo.ie", true);
put("yahoo.in", true);
put("yahoo.it", true);
put("yahoo.lt", true);
put("yahoo.lv", true);
put("yahoo.nl", true);
put("yahoo.no", true);
put("yahoo.pl", true);
put("yahoo.pt", true);
put("yahoo.ro", true);
put("yahoo.se", true);
put("yahoo.sk", true);
put("ymail.com", true);
}};
yahooDomains = Collections.unmodifiableMap(yahooDomainsMap);
}

private Email(Email.Builder builder) {
Expand Down Expand Up @@ -174,7 +370,7 @@ private String cleanAddress(String address) {
domain = cleanDomain(domain);

int stopChar;
if (domain.equals("yahoo.com")) {
if (yahooDomains.containsKey(domain)) {
stopChar = '-';
} else {
stopChar = '+';
Expand All @@ -184,6 +380,24 @@ private String cleanAddress(String address) {
localPart = localPart.substring(0, stopCharIndex);
}

if (domain.equals("gmail.com")) {
localPart = localPart.replace(".", "");
}

String[] domainParts = domain.split("\\.");
if (domainParts.length > 2) {
String possibleDomain = String.join(
".",
Arrays.copyOfRange(domainParts, 1, domainParts.length)
);
if (fastmailDomains.containsKey(possibleDomain)) {
domain = possibleDomain;
if (!localPart.equals("")) {
localPart = domainParts[0];
}
}
}

return localPart + "@" + domain;
}

Expand All @@ -200,10 +414,19 @@ private String cleanDomain(String domain) {

domain = IDN.toASCII(domain);

domain = domain.replaceAll("(?:\\.com){2,}$", ".com");
domain = domain.replaceAll("\\.com[^.]+$", ".com");
domain = domain.replaceAll("(?:\\.(?:com|c[a-z]{1,2}m|co[ln]|[dsvx]o[mn]|))$", ".com");
domain = domain.replaceAll("^\\d+(?:gmail?\\.com)$", "gmail.com");

if (typoDomains.containsKey(domain)) {
domain = typoDomains.get(domain);
}

if (equivalentDomains.containsKey(domain)) {
domain = equivalentDomains.get(domain);
}

return domain;
}

Expand Down
Loading

0 comments on commit dc563a2

Please sign in to comment.