tags:

views:

586

answers:

4

I have a String such as:

Cerepedia, una aplicación web

I would like to transform it into something URL valid such as:

Cerepedia,unaaplicacionweb

Note: the special character transformation and spaces removal.

By the way, are commas allowed in URLs?

+2  A: 

Have you looked at URLEncoder? That seems to do what you need it to do. Though the special characters will be transformed to escaped entities and not stripped from their "special" properties.

extraneon
+1  A: 

Try convertNonAscii() in the class below

public class AsciiUtils {

    /**
     * ASCII replacement letters. The character at index {@code i} here is the
     * replacement for the character at index {@code i} of {@link #UNICODE};
     * both strings must therefore stay the same length and in the same order.
     */
    private static final String PLAIN_ASCII = 
           "AaEeIiOoUu"    // grave
         + "AaEeIiOoUuYy"  // acute
         + "AaEeIiOoUuYy"  // circumflex
         + "AaEeIiOoUuYy"  // tilde
         + "AaEeIiOoUuYy"  // umlaut
         + "Aa"            // ring
         + "Cc"            // cedilla
         + "Nn"     // n tilde (spanish)
         ;

    /**
     * Accented characters, index-aligned one to one with {@link #PLAIN_ASCII}.
     * Every character must appear only once: lookups use {@code indexOf}, so a
     * duplicated entry would shadow the later one.
     */
    private static final String UNICODE =
      "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9"              // grave
     +"\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD"  // acute
     +"\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177"  // circumflex
     +"\u00C3\u00E3\u1EBC\u1EBD\u0128\u0129\u00D5\u00F5\u0168\u0169\u1EF8\u1EF9"  // tilde
     +"\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF"  // umlaut
     +"\u00C5\u00E5"                                                              // ring
     +"\u00C7\u00E7"                                                              // cedilla
     +"\u00D1\u00F1"                                                              // n tilde
     ;

    // Utility class: private constructor prevents instantiation.
    private AsciiUtils() {  
    }


    /**
     * Replaces each accented character listed in the lookup table with its
     * unaccented ASCII letter. Characters that are not in the table (including
     * spaces and punctuation) are copied to the result unchanged.
     *
     * @param s the string to convert; must not be {@code null}
     * @return a new string in which every known accented character has been
     *         replaced by its ASCII equivalent
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public static String convertNonAscii(String s) {

        int n = s.length();
        // The result has exactly one output char per input char.
        StringBuilder b = new StringBuilder(n);

        for (int i = 0; i < n; i++) {
            char c = s.charAt(i);
            int pos = UNICODE.indexOf(c);
            if (pos > -1) {
                // Same index in both tables: accented char -> plain letter.
                b.append(PLAIN_ASCII.charAt(pos));
            } else {
                b.append(c);
            }
        }

        return b.toString();

    }

}
Don
It works if the string is a literal in the source code. However, it does not work if the string is read from a UTF-8 encoded file.
Sergio del Amo
A: 

URLEncoder substitutes spaces with +. The AsciiUtils class posted by Don does not remove spaces, but the following function can be used for that purpose:

/**
 * Removes every space character (U+0020) from the given string.
 * Other whitespace such as tabs or line breaks is left untouched.
 *
 * @param s the string to strip; must not be {@code null}
 * @return {@code s} with all space characters removed
 * @throws NullPointerException if {@code s} is {@code null}
 */
public static String removeSpaces(String s) {
    // Literal (non-regex) replacement; runs in a single pass instead of
    // rebuilding an intermediate String for every token.
    return s.replace(" ", "");
}
Sergio del Amo
A: 

Note: Don's solution works with string literals in code but does not work with strings read from a UTF-8 encoded file.

This is the best solution I have, using URLEncoder and replacing the percent-encoded (hexadecimal) sequences afterwards:

String s = "Cerepedia, una apliación web";
String ENCODING= "uft-8";
String encoded_s = URLEncoder.encode(s,ENCODING); // Cerepedia+una+aplicaci%C3%83%C2%B3n+web
String s_hexa_free = EncodingTableUtils.replaceHexa(,ENCODING)); //  Cerepedia+una+aplicacion+web

EncodingTableUtils

import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;

/**
 * Lookup tables and a helper for replacing the percent-encoded form of
 * accented Spanish letters with plain ASCII letters.
 *
 * <p>Keys use upper-case hexadecimal digits, so lower-case escapes such as
 * {@code %c3%a1} are not matched.</p>
 */
public class EncodingTableUtils {
    /** Percent-encoded ISO-8859-1 bytes (one byte per letter) mapped to ASCII letters. */
    public final static HashMap<String, String> iso88591 = new HashMap<String, String>();
    static {
     iso88591.put("%E1", "a"); // a acute
     iso88591.put("%C1", "A"); // A acute
     iso88591.put("%E9", "e"); // e acute
     iso88591.put("%C9", "E"); // E acute
     iso88591.put("%ED", "i"); // i acute
     iso88591.put("%CD", "I"); // I acute
     iso88591.put("%D3", "O"); // O acute
     iso88591.put("%F3", "o"); // o acute
     iso88591.put("%FA", "u"); // u acute
     iso88591.put("%DA", "U"); // U acute
     iso88591.put("%D1", "N"); // N tilde
     iso88591.put("%F1", "n"); // n tilde
    }
    /** Percent-encoded UTF-8 bytes (two bytes per letter) mapped to ASCII letters. */
    public final static HashMap<String, String> utf8 = new HashMap<String, String>();
    static {
     utf8.put("%C3%A1", "a"); // a acute
     utf8.put("%C3%81", "A"); // A acute
     utf8.put("%C3%A9", "e"); // e acute
     utf8.put("%C3%89", "E"); // E acute
     utf8.put("%C3%AD", "i"); // i acute
     utf8.put("%C3%8D", "I"); // I acute
     utf8.put("%C3%93", "O"); // O acute
     utf8.put("%C3%B3", "o"); // o acute
     utf8.put("%C3%BA", "u"); // u acute
     utf8.put("%C3%9A", "U"); // U acute
     utf8.put("%C3%91", "N"); // N tilde
     utf8.put("%C3%B1", "n"); // n tilde
    }

    /** Maps a lower-case encoding name to the replacement table for that encoding. */
    public final static HashMap<String, HashMap<String, String>> enc_table =
            new HashMap<String, HashMap<String, String>>();
    static {
     enc_table.put("iso-8859-1", iso88591);
     enc_table.put("utf-8", utf8);
    }


    /**
     * Replaces the percent-encoded form of accented letters with plain ASCII letters.
     * <p>Example: with encoding UTF-8, {@code %C3%A1} (a acute) is replaced with {@code a}.</p>
     *
     * @param s usually a string coming from URLEncoder.encode; must not be {@code null}
     * @param enc encoding name, {@code UTF-8} or {@code ISO-8859-1} (case-insensitive);
     *            must not be {@code null}
     * @return {@code s} with every escape sequence found in the table for {@code enc}
     *         replaced, or the empty string if there is no table for {@code enc}
     */
    public static String convertHexaDecimal(String s, String enc) {
     // Locale.ROOT keeps the lookup key stable under locales with special
     // casing rules (e.g. the Turkish dotless i).
     HashMap<String, String> characters = enc_table.get(enc.toLowerCase(java.util.Locale.ROOT));
     // Unknown encodings yield an empty result, not the input.
     if (characters == null) return "";
     for (String key : characters.keySet()) {
      // Literal replacement: the keys are fixed text, so no regex is needed.
      s = s.replace(key, characters.get(key));
     }
     return s;
    }
}

EscapeChars Class

public final class EscapeChars {
/**
  * Replace characters having special meaning in regular expressions
  * with their escaped equivalents, preceded by a '\' character, so that the
  * returned fragment matches the input text literally.
  *
  * <P>The escaped characters include :
  *<ul>
  *<li>.
  *<li>\
  *<li>?, * , and +
  *<li>&
  *<li>:
  *<li>{ and }
  *<li>[ and ]
  *<li>( and )
  *<li>^ and $
  *<li>|
  *</ul>
  *
  * <P>All other characters are copied to the result unchanged.
  *
  * @param aRegexFragment the literal text to escape; must not be <code>null</code>
  * @return the text with each listed character prefixed by a backslash
  * @throws NullPointerException if <code>aRegexFragment</code> is <code>null</code>
  */
  public static String forRegex(String aRegexFragment){
    final int length = aRegexFragment.length();
    final StringBuilder result = new StringBuilder(length + 8);

    // Plain index loop: a CharacterIterator would stop early on a U+FFFF
    // character, because that value doubles as its end-of-text marker.
    for (int index = 0; index < length; index++) {
      final char character = aRegexFragment.charAt(index);
      switch (character) {
        case '.':
        case '\\':
        case '?':
        case '*':
        case '+':
        case '&':
        case ':':
        case '{':
        case '}':
        case '[':
        case ']':
        case '(':
        case ')':
        case '^':
        case '$':
        case '|': // alternation: unescaped it would split the pattern in two
          result.append('\\');
          break;
        default:
          // the char is not a special one, nothing to prepend
          break;
      }
      result.append(character);
    }
    return result.toString();
  }
}
Sergio del Amo