I don't know of any standard way to do this; I've been using a solution similar to the one you are referring to. I'm not sure which one is better, so here is mine:
/**
 * Static helpers for reducing arbitrary text to a URL-friendly "slug" made of
 * lower-case ASCII letters, digits and underscores.
 *
 * <p>The conversion pipeline is: trim, replace separators with '_', transliterate to
 * Latin script, strip diacritics, map a few special letters to their ASCII base
 * letter, drop everything that is still unwanted, and finally lower-case.
 */
public class TextUtils {

    /** Combining diacritical marks plus modifier letters (Lm) and modifier symbols (Sk). */
    private static final Pattern DIACRITICS_AND_FRIENDS =
            Pattern.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");

    private static final Transliterator TO_LATIN_TRANSLITERATOR = Transliterator.getInstance("Any-Latin");

    // Letters that are mapped by hand to an ASCII base letter. They are applied BEFORE
    // lower-casing, so both cases must be listed. Because of the '+' quantifier a run of
    // such letters collapses into a single replacement letter (e.g. "øø" -> "o").
    // Added: \u018E (capital reversed E) and \u0259 (small schwa).
    private static final Pattern EEQUIVALENTS = Pattern.compile("[ǝƏ\\u018E\\u0259]+");
    private static final Pattern IEQUIVALENTS = Pattern.compile("[ı]+");
    // Added: \u0110 (capital D with stroke), the upper-case partner of the listed small d with stroke.
    private static final Pattern DEQUIVALENTS = Pattern.compile("[Ððđ\\u0110]+");
    private static final Pattern OEQUIVALENTS = Pattern.compile("[Øø]+");
    // Added: \u0142 (small l with stroke); previously only the capital form was mapped,
    // so the lower-case letter was silently dropped by the non-ASCII filter.
    private static final Pattern LEQUIVALENTS = Pattern.compile("[Ł\\u0142]+");

    // Whitespace, every non-ASCII character, punctuation and '+', with '_' exempted.
    // '-' is NOT exempted here; hyphens are turned into '_' by SEPARATORS beforehand.
    private static final Pattern CRAP = Pattern.compile("[\\p{IsSpace}\\P{IsASCII}\\p{IsP}\\+&&[^_]]");

    /** Characters that separate words: whitespace, '/', '`' and '-'. */
    private static final Pattern SEPARATORS = Pattern.compile("[\\p{IsSpace}/`-]");

    /** Accepts ASCII letters, digits and '_' only; note that the empty string also matches. */
    private static final Pattern URLFRIENDLY = Pattern.compile("([a-zA-Z0-9_])*");

    /**
     * Returns true when the input text contains only characters from the 7-bit ASCII set
     * (U+0000..U+007F), false otherwise. The empty string is considered pure ASCII.
     *
     * @param text the text to inspect; must not be null
     * @return true if every character of {@code text} is an ASCII character
     */
    public static boolean isPureAscii(String text) {
        // A plain scan replaces the former shared ISO-8859-1 CharsetEncoder, which
        // accepted Latin-1 characters such as 'é' and is not safe for concurrent use.
        for (int i = 0; i < text.length(); i++) {
            if (text.charAt(i) > 0x7F) {
                return false;
            }
        }
        return true;
    }

    /**
     * Strips diacritics from the text: the input is normalized to NFKD and every run of
     * combining diacritical marks, modifier letters and modifier symbols is removed
     * (e.g. ü -> u).
     *
     * @param text the text to clean; must not be null
     * @return the normalized text without the removed marks
     */
    public static String replaceCombiningDiacriticalMarks(String text) {
        String decomposed = Normalizer.normalize(text, Normalizer.Form.NFKD);
        return DIACRITICS_AND_FRIENDS.matcher(decomposed).replaceAll("");
    }

    /**
     * Turns the input string into a url friendly variant containing only lower-case ASCII
     * letters, digits and '_'. Separators (whitespace, '/', '`', '-') become '_'.
     * The result may be empty when nothing usable remains.
     *
     * @param unfriendlyString the text to convert; must not be null
     * @return the url friendly variant
     * @throws IllegalArgumentException if the converted string still contains a character
     *         other than an ASCII letter, digit or '_'
     */
    public static String urlFriendlyStrict(String unfriendlyString) throws IllegalArgumentException {
        String friendlyString = urlFriendly(unfriendlyString);
        // Explicit check so the message is only formatted on failure.
        if (!URLFRIENDLY.matcher(friendlyString).matches()) {
            throw new IllegalArgumentException(String.format(
                    "Friendly string [%s] based on [%s] is not friendly enough",
                    friendlyString, unfriendlyString));
        }
        return friendlyString;
    }

    /**
     * Best-effort conversion to a url friendly variant. The result is not verified here;
     * use {@link #urlFriendlyStrict(String)} to get a checked result.
     */
    private static String urlFriendly(String unfriendlyString) {
        String separated = replaceSeparatorsWithUnderscores(unfriendlyString.trim());
        String latin = transLiterateSymbols(separated);
        String withoutMarks = replaceCombiningDiacriticalMarks(latin);
        String baseLetters = replaceEquivalentsOfSymbols(withoutMarks);
        String cleaned = removeCrappyCharacters(baseLetters);
        // Locale.ROOT keeps the result ASCII: a locale-sensitive lower-casing (e.g. Turkish)
        // would turn 'I' into dotless 'ı' after the non-ASCII filter has already run.
        return cleaned.toLowerCase(java.util.Locale.ROOT);
    }

    /** Runs the "Any-Latin" transliterator over the text. */
    private static String transLiterateSymbols(String incomprehensibleString) {
        return TO_LATIN_TRANSLITERATOR.transform(incomprehensibleString);
    }

    /** Replaces the hand-mapped special letters with e, i, d, o and l (in that order). */
    private static String replaceEquivalentsOfSymbols(String unfriendlyString) {
        String result = EEQUIVALENTS.matcher(unfriendlyString).replaceAll("e");
        result = IEQUIVALENTS.matcher(result).replaceAll("i");
        result = DEQUIVALENTS.matcher(result).replaceAll("d");
        result = OEQUIVALENTS.matcher(result).replaceAll("o");
        return LEQUIVALENTS.matcher(result).replaceAll("l");
    }

    /** Removes every character matched by {@link #CRAP}. */
    private static String removeCrappyCharacters(String unfriendlyString) {
        return CRAP.matcher(unfriendlyString).replaceAll("");
    }

    /** Replaces every separator character with a single '_'. */
    private static String replaceSeparatorsWithUnderscores(String unfriendlyString) {
        return SEPARATORS.matcher(unfriendlyString).replaceAll("_");
    }
}