package de.brightbyte.xml;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.brightbyte.util.StringUtils;

/**
 * Static helpers for decoding HTML character references in text.
 * <p>
 * Supports the named entities listed in {@link #entities} as well as decimal
 * and hexadecimal numeric character references. References that cannot be
 * resolved (unknown names, unparsable numbers, values outside the Unicode
 * code point range) are left in the text exactly as they were written.
 */
public class HtmlEntities {

	/**
	 * Unmodifiable map from entity name (without the leading ampersand and the
	 * trailing semicolon) to the text the entity stands for. Names are case
	 * sensitive.
	 */
	public static final Map<String, String> entities;
	
	static {
		Map<String, String> ent = new HashMap<String, String>();
		ent.put("quot", "\""); // quotation mark
		ent.put("apos", "'"); // apostrophe
		ent.put("amp", "&"); // ampersand
		ent.put("lt", "<"); // less-than sign
		ent.put("gt", ">"); // greater-than sign
		ent.put("nbsp", "\u00A0"); // no-break space
		ent.put("iexcl", "\u00A1"); // inverted exclamation mark
		ent.put("cent", "\u00A2"); // cent sign
		ent.put("pound", "\u00A3"); // pound sign
		ent.put("curren", "\u00A4"); // currency sign
		ent.put("yen", "\u00A5"); // yen sign
		ent.put("brvbar", "\u00A6"); // broken bar
		ent.put("sect", "\u00A7"); // section sign
		ent.put("uml", "\u00A8"); // diaeresis
		ent.put("copy", "\u00A9"); // copyright sign
		ent.put("ordf", "\u00AA"); // feminine ordinal indicator
		ent.put("laquo", "\u00AB"); // left-pointing double angle quotation mark
		ent.put("not", "\u00AC"); // not sign
		ent.put("shy", "\u00AD"); // soft hyphen
		ent.put("reg", "\u00AE"); // registered sign
		ent.put("macr", "\u00AF"); // macron
		ent.put("deg", "\u00B0"); // degree sign
		ent.put("plusmn", "\u00B1"); // plus-minus sign
		ent.put("sup2", "\u00B2"); // superscript two
		ent.put("sup3", "\u00B3"); // superscript three
		ent.put("acute", "\u00B4"); // acute accent
		ent.put("micro", "\u00B5"); // micro sign
		ent.put("para", "\u00B6"); // pilcrow sign
		ent.put("middot", "\u00B7"); // middle dot
		ent.put("cedil", "\u00B8"); // cedilla
		ent.put("sup1", "\u00B9"); // superscript one
		ent.put("ordm", "\u00BA"); // masculine ordinal indicator
		ent.put("raquo", "\u00BB"); // right-pointing double angle quotation mark
		ent.put("frac14", "\u00BC"); // vulgar fraction one quarter
		ent.put("frac12", "\u00BD"); // vulgar fraction one half
		ent.put("frac34", "\u00BE"); // vulgar fraction three quarters
		ent.put("iquest", "\u00BF"); // inverted question mark
		ent.put("Agrave", "\u00C0"); // Latin capital letter a with grave
		ent.put("Aacute", "\u00C1"); // Latin capital letter a with acute
		ent.put("Acirc", "\u00C2"); // Latin capital letter a with circumflex
		ent.put("Atilde", "\u00C3"); // Latin capital letter a with tilde
		ent.put("Auml", "\u00C4"); // Latin capital letter a with diaeresis
		ent.put("Aring", "\u00C5"); // Latin capital letter a with ring above
		ent.put("AElig", "\u00C6"); // Latin capital letter ae
		ent.put("Ccedil", "\u00C7"); // Latin capital letter c with cedilla
		ent.put("Egrave", "\u00C8"); // Latin capital letter e with grave
		ent.put("Eacute", "\u00C9"); // Latin capital letter e with acute
		ent.put("Ecirc", "\u00CA"); // Latin capital letter e with circumflex
		ent.put("Euml", "\u00CB"); // Latin capital letter e with diaeresis
		ent.put("Igrave", "\u00CC"); // Latin capital letter i with grave
		ent.put("Iacute", "\u00CD"); // Latin capital letter i with acute
		ent.put("Icirc", "\u00CE"); // Latin capital letter i with circumflex
		ent.put("Iuml", "\u00CF"); // Latin capital letter i with diaeresis
		ent.put("ETH", "\u00D0"); // Latin capital letter eth
		ent.put("Ntilde", "\u00D1"); // Latin capital letter n with tilde
		ent.put("Ograve", "\u00D2"); // Latin capital letter o with grave
		ent.put("Oacute", "\u00D3"); // Latin capital letter o with acute
		ent.put("Ocirc", "\u00D4"); // Latin capital letter o with circumflex
		ent.put("Otilde", "\u00D5"); // Latin capital letter o with tilde
		ent.put("Ouml", "\u00D6"); // Latin capital letter o with diaeresis
		ent.put("times", "\u00D7"); // multiplication sign
		ent.put("Oslash", "\u00D8"); // Latin capital letter o with stroke
		ent.put("Ugrave", "\u00D9"); // Latin capital letter u with grave
		ent.put("Uacute", "\u00DA"); // Latin capital letter u with acute
		ent.put("Ucirc", "\u00DB"); // Latin capital letter u with circumflex
		ent.put("Uuml", "\u00DC"); // Latin capital letter u with diaeresis
		ent.put("Yacute", "\u00DD"); // Latin capital letter y with acute
		ent.put("THORN", "\u00DE"); // Latin capital letter thorn
		ent.put("szlig", "\u00DF"); // Latin small letter sharp s (German Eszett)
		ent.put("agrave", "\u00E0"); // Latin small letter a with grave
		ent.put("aacute", "\u00E1"); // Latin small letter a with acute
		ent.put("acirc", "\u00E2"); // Latin small letter a with circumflex
		ent.put("atilde", "\u00E3"); // Latin small letter a with tilde
		ent.put("auml", "\u00E4"); // Latin small letter a with diaeresis
		ent.put("aring", "\u00E5"); // Latin small letter a with ring above
		ent.put("aelig", "\u00E6"); // Latin lowercase ligature ae
		ent.put("ccedil", "\u00E7"); // Latin small letter c with cedilla
		ent.put("egrave", "\u00E8"); // Latin small letter e with grave
		ent.put("eacute", "\u00E9"); // Latin small letter e with acute
		ent.put("ecirc", "\u00EA"); // Latin small letter e with circumflex
		ent.put("euml", "\u00EB"); // Latin small letter e with diaeresis
		ent.put("igrave", "\u00EC"); // Latin small letter i with grave
		ent.put("iacute", "\u00ED"); // Latin small letter i with acute
		ent.put("icirc", "\u00EE"); // Latin small letter i with circumflex
		ent.put("iuml", "\u00EF"); // Latin small letter i with diaeresis
		ent.put("eth", "\u00F0"); // Latin small letter eth
		ent.put("ntilde", "\u00F1"); // Latin small letter n with tilde
		ent.put("ograve", "\u00F2"); // Latin small letter o with grave
		ent.put("oacute", "\u00F3"); // Latin small letter o with acute
		ent.put("ocirc", "\u00F4"); // Latin small letter o with circumflex
		ent.put("otilde", "\u00F5"); // Latin small letter o with tilde
		ent.put("ouml", "\u00F6"); // Latin small letter o with diaeresis
		ent.put("divide", "\u00F7"); // division sign
		ent.put("oslash", "\u00F8"); // Latin small letter o with stroke
		ent.put("ugrave", "\u00F9"); // Latin small letter u with grave
		ent.put("uacute", "\u00FA"); // Latin small letter u with acute
		ent.put("ucirc", "\u00FB"); // Latin small letter u with circumflex
		ent.put("uuml", "\u00FC"); // Latin small letter u with diaeresis
		ent.put("yacute", "\u00FD"); // Latin small letter y with acute
		ent.put("thorn", "\u00FE"); // Latin small letter thorn
		ent.put("yuml", "\u00FF"); // Latin small letter y with diaeresis
		ent.put("OElig", "\u0152"); // Latin capital ligature oe
		ent.put("oelig", "\u0153"); // Latin small ligature oe
		ent.put("Scaron", "\u0160"); // Latin capital letter s with caron
		ent.put("scaron", "\u0161"); // Latin small letter s with caron
		ent.put("Yuml", "\u0178"); // Latin capital letter y with diaeresis
		ent.put("fnof", "\u0192"); // Latin small letter f with hook
		ent.put("circ", "\u02C6"); // modifier letter circumflex accent
		ent.put("tilde", "\u02DC"); // small tilde
		ent.put("Alpha", "\u0391"); // Greek capital letter alpha
		ent.put("Beta", "\u0392"); // Greek capital letter beta
		ent.put("Gamma", "\u0393"); // Greek capital letter gamma
		ent.put("Delta", "\u0394"); // Greek capital letter delta
		ent.put("Epsilon", "\u0395"); // Greek capital letter epsilon
		ent.put("Zeta", "\u0396"); // Greek capital letter zeta
		ent.put("Eta", "\u0397"); // Greek capital letter eta
		ent.put("Theta", "\u0398"); // Greek capital letter theta
		ent.put("Iota", "\u0399"); // Greek capital letter iota
		ent.put("Kappa", "\u039A"); // Greek capital letter kappa
		ent.put("Lambda", "\u039B"); // Greek capital letter lambda
		ent.put("Mu", "\u039C"); // Greek capital letter mu
		ent.put("Nu", "\u039D"); // Greek capital letter nu
		ent.put("Xi", "\u039E"); // Greek capital letter xi
		ent.put("Omicron", "\u039F"); // Greek capital letter omicron
		ent.put("Pi", "\u03A0"); // Greek capital letter pi
		ent.put("Rho", "\u03A1"); // Greek capital letter rho
		ent.put("Sigma", "\u03A3"); // Greek capital letter sigma
		ent.put("Tau", "\u03A4"); // Greek capital letter tau
		ent.put("Upsilon", "\u03A5"); // Greek capital letter upsilon
		ent.put("Phi", "\u03A6"); // Greek capital letter phi
		ent.put("Chi", "\u03A7"); // Greek capital letter chi
		ent.put("Psi", "\u03A8"); // Greek capital letter psi
		ent.put("Omega", "\u03A9"); // Greek capital letter omega
		ent.put("alpha", "\u03B1"); // Greek small letter alpha
		ent.put("beta", "\u03B2"); // Greek small letter beta
		ent.put("gamma", "\u03B3"); // Greek small letter gamma
		ent.put("delta", "\u03B4"); // Greek small letter delta
		ent.put("epsilon", "\u03B5"); // Greek small letter epsilon
		ent.put("zeta", "\u03B6"); // Greek small letter zeta
		ent.put("eta", "\u03B7"); // Greek small letter eta
		ent.put("theta", "\u03B8"); // Greek small letter theta
		ent.put("iota", "\u03B9"); // Greek small letter iota
		ent.put("kappa", "\u03BA"); // Greek small letter kappa
		ent.put("lambda", "\u03BB"); // Greek small letter lambda
		ent.put("mu", "\u03BC"); // Greek small letter mu
		ent.put("nu", "\u03BD"); // Greek small letter nu
		ent.put("xi", "\u03BE"); // Greek small letter xi
		ent.put("omicron", "\u03BF"); // Greek small letter omicron
		ent.put("pi", "\u03C0"); // Greek small letter pi
		ent.put("rho", "\u03C1"); // Greek small letter rho
		ent.put("sigmaf", "\u03C2"); // Greek small letter final sigma
		ent.put("sigma", "\u03C3"); // Greek small letter sigma
		ent.put("tau", "\u03C4"); // Greek small letter tau
		ent.put("upsilon", "\u03C5"); // Greek small letter upsilon
		ent.put("phi", "\u03C6"); // Greek small letter phi
		ent.put("chi", "\u03C7"); // Greek small letter chi
		ent.put("psi", "\u03C8"); // Greek small letter psi
		ent.put("omega", "\u03C9"); // Greek small letter omega
		ent.put("thetasym", "\u03D1"); // Greek theta symbol
		ent.put("upsih", "\u03D2"); // Greek upsilon with hook symbol
		ent.put("piv", "\u03D6"); // Greek pi symbol
		ent.put("ensp", "\u2002"); // en space
		ent.put("emsp", "\u2003"); // em space
		ent.put("thinsp", "\u2009"); // thin space
		ent.put("zwnj", "\u200C"); // zero width non-joiner
		ent.put("zwj", "\u200D"); // zero width joiner
		ent.put("lrm", "\u200E"); // left-to-right mark
		ent.put("rlm", "\u200F"); // right-to-left mark
		ent.put("ndash", "\u2013"); // en dash
		ent.put("mdash", "\u2014"); // em dash
		ent.put("lsquo", "\u2018"); // left single quotation mark
		ent.put("rsquo", "\u2019"); // right single quotation mark
		ent.put("sbquo", "\u201A"); // single low-9 quotation mark
		ent.put("ldquo", "\u201C"); // left double quotation mark
		ent.put("rdquo", "\u201D"); // right double quotation mark
		ent.put("bdquo", "\u201E"); // double low-9 quotation mark
		ent.put("dagger", "\u2020"); // dagger
		ent.put("Dagger", "\u2021"); // double dagger
		ent.put("bull", "\u2022"); // bullet
		ent.put("hellip", "\u2026"); // horizontal ellipsis
		ent.put("permil", "\u2030"); // per mille sign
		ent.put("prime", "\u2032"); // prime
		ent.put("Prime", "\u2033"); // double prime
		ent.put("lsaquo", "\u2039"); // single left-pointing angle quotation mark
		ent.put("rsaquo", "\u203A"); // single right-pointing angle quotation mark
		ent.put("oline", "\u203E"); // overline
		ent.put("frasl", "\u2044"); // fraction slash
		ent.put("euro", "\u20AC"); // euro sign
		ent.put("image", "\u2111"); // black-letter capital i
		ent.put("weierp", "\u2118"); // script capital p (Weierstrass p)
		ent.put("real", "\u211C"); // black-letter capital r
		ent.put("trade", "\u2122"); // trademark sign
		ent.put("alefsym", "\u2135"); // alef symbol
		ent.put("larr", "\u2190"); // leftwards arrow
		ent.put("uarr", "\u2191"); // upwards arrow
		ent.put("rarr", "\u2192"); // rightwards arrow
		ent.put("darr", "\u2193"); // downwards arrow
		ent.put("harr", "\u2194"); // left right arrow
		ent.put("crarr", "\u21B5"); // downwards arrow with corner leftwards
		ent.put("lArr", "\u21D0"); // leftwards double arrow
		ent.put("uArr", "\u21D1"); // upwards double arrow
		ent.put("rArr", "\u21D2"); // rightwards double arrow
		ent.put("dArr", "\u21D3"); // downwards double arrow
		ent.put("hArr", "\u21D4"); // left right double arrow
		ent.put("forall", "\u2200"); // for all
		ent.put("part", "\u2202"); // partial differential
		ent.put("exist", "\u2203"); // there exists
		ent.put("empty", "\u2205"); // empty set
		ent.put("nabla", "\u2207"); // nabla
		ent.put("isin", "\u2208"); // element of
		ent.put("notin", "\u2209"); // not an element of
		ent.put("ni", "\u220B"); // contains as member
		ent.put("prod", "\u220F"); // n-ary product
		ent.put("sum", "\u2211"); // n-ary summation
		ent.put("minus", "\u2212"); // minus sign
		ent.put("lowast", "\u2217"); // asterisk operator
		ent.put("radic", "\u221A"); // square root
		ent.put("prop", "\u221D"); // proportional to
		ent.put("infin", "\u221E"); // infinity
		ent.put("ang", "\u2220"); // angle
		ent.put("and", "\u2227"); // logical and
		ent.put("or", "\u2228"); // logical or
		ent.put("cap", "\u2229"); // intersection
		ent.put("cup", "\u222A"); // union
		ent.put("int", "\u222B"); // integral
		ent.put("there4", "\u2234"); // therefore
		ent.put("sim", "\u223C"); // tilde operator
		ent.put("cong", "\u2245"); // congruent to
		ent.put("asymp", "\u2248"); // almost equal to
		ent.put("ne", "\u2260"); // not equal to
		ent.put("equiv", "\u2261"); // identical to (equivalent to)
		ent.put("le", "\u2264"); // less-than or equal to
		ent.put("ge", "\u2265"); // greater-than or equal to
		ent.put("sub", "\u2282"); // subset of
		ent.put("sup", "\u2283"); // superset of
		ent.put("nsub", "\u2284"); // not a subset of
		ent.put("sube", "\u2286"); // subset of or equal to
		ent.put("supe", "\u2287"); // superset of or equal to
		ent.put("oplus", "\u2295"); // circled plus
		ent.put("otimes", "\u2297"); // circled times
		ent.put("perp", "\u22A5"); // up tack (perpendicular sign in math)
		ent.put("sdot", "\u22C5"); // dot operator
		ent.put("lceil", "\u2308"); // left ceiling
		ent.put("rceil", "\u2309"); // right ceiling
		ent.put("lfloor", "\u230A"); // left floor
		ent.put("rfloor", "\u230B"); // right floor
		ent.put("lang", "\u2329"); // left-pointing angle bracket
		ent.put("rang", "\u232A"); // right-pointing angle bracket
		ent.put("loz", "\u25CA"); // lozenge
		ent.put("spades", "\u2660"); // black spade suit
		ent.put("clubs", "\u2663"); // black club suit
		ent.put("hearts", "\u2665"); // black heart suit
		ent.put("diams", "\u2666"); // black diamond suit
		
		entities = Collections.unmodifiableMap(ent);
		
	}
	
	/**
	 * Matches one character reference: an ampersand, then either a run of word
	 * characters (named entity), a hash sign followed by 1 to 12 decimal digits,
	 * or a hash sign and an x followed by 1 to 8 hex digits, then a semicolon.
	 * Group 1 holds the part between the ampersand and the semicolon, which is
	 * the form expected by {@link #resolveEntity(String)}.
	 */
	public static final Pattern entityRefPattern = Pattern.compile("&(\\w+|#\\d{1,12}|#[Xx][0-9a-fA-F]{1,8});");
	
	/**
	 * Decodes all character references in the given string.
	 *
	 * @param text the text to decode; may be null
	 * @return the decoded text, or null if {@code text} was null
	 * @see #decodeEntities(CharSequence)
	 */
	public static String decodeEntities (String text) {
		if (text==null) return null;
		return decodeEntities((CharSequence)text).toString();
	}
	
	/**
	 * Decodes all character references in the given text. Every match of
	 * {@link #entityRefPattern} is replaced by the result of
	 * {@link #resolveEntity(String)}; references that cannot be resolved
	 * therefore stay in the text unchanged.
	 *
	 * @param text the text to decode; may be null
	 * @return null if {@code text} was null; {@code text} itself if it contains
	 *         no character reference; otherwise a new buffer holding the
	 *         decoded text
	 */
	public static CharSequence decodeEntities (CharSequence text) {
		//TODO: optionally normalize nbsp -> blank, etc
		
		if (text==null) return null;
		
		// The shortest reference the pattern can usefully match is four characters
		// long, and every reference starts with an ampersand, so anything else can
		// be handed back without running the regex at all.
		if (text.length()<4 || !containsAmpersand(text)) return text;
		
		Matcher m = entityRefPattern.matcher(text);
		if (!m.find()) return text;
		
		// A reference is never shorter than its replacement, so the decoded text
		// cannot be longer than the input.
		StringBuffer s = new StringBuffer(text.length());
		
		do {
			String r = resolveEntity(m.group(1));
			// quoteReplacement keeps '$' and '\' in the resolved text literal
			m.appendReplacement(s, Matcher.quoteReplacement(r));
		} while (m.find());
		
		m.appendTail(s);
		return s;
	}

	/**
	 * Resolves a single character reference.
	 *
	 * @param name the reference without the leading ampersand and the trailing
	 *        semicolon: an entity name such as {@code amp}, a decimal reference
	 *        such as {@code #65}, or a hex reference such as {@code #x41};
	 *        must not be null
	 * @return the text the reference stands for; if the name is unknown, the
	 *         number cannot be parsed, or the number is not a Unicode code
	 *         point, the reference is returned in its literal form, that is
	 *         {@code name} wrapped in an ampersand and a semicolon
	 */
	public static String resolveEntity(String name) {
		if (name.length()>0 && name.charAt(0)=='#') {
			String decoded = decodeNumericReference(name);
			if (decoded!=null) return decoded;
		}
		else {
			String s = entities.get(name);
			if (s!=null) return s;
		}
		
		return "&"+name+";";
	}

	/**
	 * Decodes a numeric reference of the form {@code #123} or {@code #x7B}.
	 *
	 * @param name the reference, starting with the hash sign
	 * @return the referenced character as a string, or null if the digits
	 *         cannot be parsed or the value is outside the code point range
	 */
	private static String decodeNumericReference(String name) {
		if (name.length()<2) return null;
		
		char ch = name.charAt(1);
		long code;
		
		try {
			// long rather than int: the pattern admits up to 12 decimal or 8 hex
			// digits, which can exceed the int range
			if (ch=='x' || ch=='X') {
				code = Long.parseLong(name.substring(2), 16);
			}
			else {
				code = Long.parseLong(name.substring(1), 10);
			}
		}
		catch (NumberFormatException e) {
			return null;
		}
		
		if (code<Character.MIN_CODE_POINT || code>Character.MAX_CODE_POINT) return null;
		
		return new String( new int [] { (int)code }, 0, 1 );
	}

	/** Tells whether the given text contains at least one ampersand. */
	private static boolean containsAmpersand(CharSequence text) {
		for (int i = 0, n = text.length(); i<n; i++) {
			if (text.charAt(i)=='&') return true;
		}
		
		return false;
	}

}
