Arbitrary text placed in an HTML tag often needs to be altered, to ensure that the resulting HTML remains valid.
Problem characters can include <, >, ", ', \ and &.
These characters can be replaced with HTML character entities. For example,
< can be replaced with
&lt;.
Query strings (Blah=1&Name=Bob) often need to be escaped as well. If the query string contains special characters, it will need to be "URL encoded". (See the javadoc for the URLEncoder class for further information.) This will ensure the query string conforms with valid HTTP.
There is often a second issue, however, with regard to query strings. If a query string is placed in an HREF attribute, then even a URL encoded query string is often not of valid form. This is because URLEncoder produces valid HTTP, but it does not in general produce text which is a valid HTML attribute - the ampersand character needs to be replaced by the corresponding character entity &amp;.
Here is an example of a utility class which escapes special characters for HTML tags, URL fragments, and regular expressions.
package hirondelle.web4j.util;
import java.net.URLEncoder;
import java.io.UnsupportedEncodingException;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
/**
 * Convenience methods for altering special characters related to URLs,
 * regular expressions, and HTML tags.
 *
 * <p>All methods are static and keep no state; the class cannot be instantiated.
 */
public final class EscapeChars {

  /**
   * Every character that {@link #forRegex} prefixes with a backslash.
   * The pipe is included because, left unescaped, it would turn the
   * fragment into an alternation.
   */
  private static final String REGEX_SPECIAL_CHARS = ".\\?*+&:{}[]()^$|";

  /**
   * Synonym for {@code URLEncoder.encode(String, "UTF-8")}.
   *
   * <p>Used to ensure that HTTP query strings are in proper form, by escaping
   * special characters such as spaces.
   *
   * <p>An example use case for this method is a login scheme in which, after
   * successful login, the user is redirected to the "original" target
   * destination. Such a target might be passed around as a request parameter.
   * Such a request parameter will have a URL as its value, as in
   * {@code LoginTarget=Blah.jsp?this=that&blah=boo}, and would need to be
   * URL-encoded in order to escape its special characters.
   *
   * <p>It is important to note that if a query string appears in an
   * {@code HREF} attribute, then there are two issues - ensuring the query
   * string is valid HTTP (it is URL-encoded), and ensuring it is valid HTML
   * (ensuring the ampersand is escaped, see {@link #forHTMLTag}).
   *
   * @param aURLFragment the text to encode; must not be null
   * @return the URL-encoded form of {@code aURLFragment}
   * @throws RuntimeException wrapping the {@link UnsupportedEncodingException}
   *         if the platform does not know the UTF-8 encoding
   */
  public static String forURL(String aURLFragment){
    try {
      return URLEncoder.encode(aURLFragment, "UTF-8");
    }
    catch (UnsupportedEncodingException ex){
      throw new RuntimeException("UTF-8 not supported", ex);
    }
  }

  /**
   * Replace characters having special meaning inside HTML tags
   * with their escaped equivalents, using character entities such as
   * {@code &amp;}.
   *
   * <p>The escaped characters are :
   * <ul>
   * <li>{@code <} becomes {@code &lt;}
   * <li>{@code >} becomes {@code &gt;}
   * <li>{@code "} becomes {@code &quot;}
   * <li>{@code '} becomes {@code &#039;}
   * <li>{@code \} becomes {@code &#092;}
   * <li>{@code &} becomes {@code &amp;}
   * </ul>
   *
   * <p>This method ensures that arbitrary text appearing inside a tag does not
   * "confuse" the tag. For example, {@code HREF='Blah.do?Page=1&Sort=ASC'}
   * does not comply with strict HTML because of the ampersand, and should be
   * changed to {@code HREF='Blah.do?Page=1&amp;Sort=ASC'}. This is commonly
   * seen in building query strings. (In JSTL, the c:url tag performs this
   * task automatically.)
   *
   * @param aTagFragment the text to escape
   * @return a copy of {@code aTagFragment} with the characters above replaced
   * @throws NullPointerException if {@code aTagFragment} is null
   */
  public static String forHTMLTag(String aTagFragment){
    final int length = aTagFragment.length();
    final StringBuilder result = new StringBuilder(length + 16);
    // Index loop rather than StringCharacterIterator: the iterator's DONE
    // sentinel is the char U+FFFF, so input containing it would be cut short.
    for (int idx = 0; idx < length; ++idx) {
      final char character = aTagFragment.charAt(idx);
      switch (character) {
        case '<':
          result.append("&lt;");
          break;
        case '>':
          result.append("&gt;");
          break;
        case '"':
          result.append("&quot;");
          break;
        case '\'':
          result.append("&#039;");
          break;
        case '\\':
          result.append("&#092;");
          break;
        case '&':
          result.append("&amp;");
          break;
        default:
          // not a special character - copy it through unchanged
          result.append(character);
          break;
      }
    }
    return result.toString();
  }

  /**
   * Return {@code aText} with all start-of-tag and end-of-tag characters
   * ({@code <} and {@code >}) replaced by {@code &lt;} and {@code &gt;}.
   *
   * <p>If user input may contain tags which must be disabled, then call
   * this method, not {@link #forHTMLTag}. This method is used for text
   * appearing outside of a tag, while {@link #forHTMLTag} is used for text
   * appearing inside an HTML tag.
   *
   * <p>It is not uncommon to see text on a web page presented erroneously,
   * because all special characters are escaped (as in {@link #forHTMLTag}).
   * In particular, the ampersand character is often escaped not once but
   * twice : once when the original input occurs, and then a second time when
   * the same item is retrieved from the database. This occurs because the
   * ampersand is the only escaped character which appears in a character
   * entity. This method leaves the ampersand untouched.
   *
   * @param aText the text in which tags are to be disabled
   * @return a copy of {@code aText} with {@code <} and {@code >} replaced
   * @throws NullPointerException if {@code aText} is null
   */
  public static String toDisableTags(String aText){
    final int length = aText.length();
    final StringBuilder result = new StringBuilder(length + 16);
    // Index loop for the same reason as forHTMLTag: U+FFFF must not end the scan.
    for (int idx = 0; idx < length; ++idx) {
      final char character = aText.charAt(idx);
      if (character == '<') {
        result.append("&lt;");
      }
      else if (character == '>') {
        result.append("&gt;");
      }
      else {
        // not a tag delimiter - copy it through unchanged
        result.append(character);
      }
    }
    return result.toString();
  }

  /**
   * Replace characters having special meaning in regular expressions
   * with their escaped equivalents, by placing a backslash in front of each.
   *
   * <p>The escaped characters are :
   * <ul>
   * <li>{@code .}
   * <li>{@code \}
   * <li>{@code ?}, {@code *}, and {@code +}
   * <li>{@code &}
   * <li>{@code :}
   * <li>{@code |}
   * <li>{@code {} and {@code }}
   * <li>{@code [} and {@code ]}
   * <li>{@code (} and {@code )}
   * <li>{@code ^} and {@code $}
   * </ul>
   *
   * @param aRegexFragment the literal text to be embedded in a regular expression
   * @return a copy of {@code aRegexFragment} with the characters above escaped
   * @throws NullPointerException if {@code aRegexFragment} is null
   */
  public static String forRegex(String aRegexFragment){
    final int length = aRegexFragment.length();
    final StringBuilder result = new StringBuilder(length + 16);
    for (int idx = 0; idx < length; ++idx) {
      final char character = aRegexFragment.charAt(idx);
      if (REGEX_SPECIAL_CHARS.indexOf(character) >= 0) {
        // every special character is escaped the same way: backslash + itself
        result.append('\\');
      }
      result.append(character);
    }
    return result.toString();
  }

  // PRIVATE //

  private EscapeChars(){
    //empty - prevent construction
  }
}