Better handling of quoted attributes.

Aug 4, 2012 at 4:37 PM
Edited Aug 4, 2012 at 4:50 PM

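The attribute-handling code in HtmlTag.cs (around line 54) fails when a quoted attribute value contains '=': the match is split on every '=', so it yields more than two pieces and the attribute is silently dropped. For a more robust solution, change: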

// Extract attributes: run the attribute pattern over the tag text and store
// each name/value pair in Attributes (the first occurrence of a name wins).
MatchCollection atts = Parser.Match(Parser.HmlTagAttributes, tag);

foreach (Match att in atts)
{
    // Extract attribute and value by splitting the matched text on every '='.
    // A value that itself contains '=' (e.g. href="a?b=c") yields more than
    // two chunks; neither branch below handles that case, so the attribute is
    // silently skipped.
    string[] chunks = att.Value.Split('=');

    if (chunks.Length == 1)
    {
        // Name only, no value. Note the existence check uses the name as
        // written while the stored key is lower-cased, so the check and the
        // insert can disagree for mixed-case names.
        // NOTE(review): whether Add then throws depends on the type/comparer
        // of Attributes, which is not visible here - confirm.
        if(!Attributes.ContainsKey(chunks[0]))
            Attributes.Add(chunks[0].ToLower(), string.Empty);
    }
    else if (chunks.Length == 2)
    {
        // name=value. Unlike the branch above, the name is not lower-cased.
        string attname = chunks[0].Trim();
        string attvalue = chunks[1].Trim();

        // Strip one pair of surrounding double quotes. Because of the
        // Length > 2 test an empty quoted value ("") keeps its quotes, and
        // single-quoted values are never unquoted.
        if (attvalue.StartsWith("\"") && attvalue.EndsWith("\"") && attvalue.Length > 2)
        {
            attvalue = attvalue.Substring(1, attvalue.Length - 2);
        }

        // Keep the first value seen for a given name.
        if (!Attributes.ContainsKey(attname))
            Attributes.Add(attname, attvalue);
    }
}

to

// Extract attributes. Each match carries a "name" group and, when the
// attribute has a value, either a "valueq" group (quoted; the "q" group holds
// the quote character) or a "value" group (unquoted).
foreach (Match att in Parser.Match(Parser.HmlTagAttributes, tag))
{
    // Attribute names are compared case-insensitively, so normalise the key.
    // Use the invariant culture: ToLower() depends on the current thread
    // culture (under tr-TR, "ID" becomes "ıd"), which would make lookups of
    // plain ASCII names fail on some machines.
    string attname = att.Groups["name"].Value.ToLowerInvariant();

    // The first occurrence of a name wins; later duplicates are ignored.
    if (!this.Attributes.ContainsKey(attname))
    {
        // attvalue will be string.Empty for attributes without an '='
        // (an unmatched group's Value is the empty string).
        string valueGroup = att.Groups["q"].Success ? "valueq" : "value";
        string attvalue = System.Web.HttpUtility.HtmlDecode(att.Groups[valueGroup].Value);
        this.Attributes.Add(attname, attvalue);
    }
}

with additional changes to Parser.cs. First, because I like pretty regular expressions, change:

/// <summary>
/// Returns every match of <paramref name="regex"/> in <paramref name="source"/>.
/// </summary>
/// <param name="regex">Regular expression pattern to search for.</param>
/// <param name="source">Text to search.</param>
/// <returns>All matches found in <paramref name="source"/>.</returns>
/// <remarks>
/// The pattern is applied case-insensitively (IgnoreCase) and '.' also
/// matches newlines (Singleline).
/// </remarks>
public static MatchCollection Match(string regex, string source)
{
    Regex r = new Regex(regex, RegexOptions.IgnoreCase | RegexOptions.Singleline);
    return r.Matches(source);
}

to

/// <summary>
/// Returns every match of <paramref name="regex"/> in <paramref name="source"/>.
/// </summary>
/// <param name="regex">Regular expression pattern to search for.</param>
/// <param name="source">Text to search.</param>
/// <returns>All matches found in <paramref name="source"/>.</returns>
/// <remarks>
/// The pattern is applied case-insensitively (IgnoreCase), '.' also matches
/// newlines (Singleline), and IgnorePatternWhitespace is enabled. The last
/// option applies to EVERY pattern passed in: unescaped whitespace in a
/// pattern is ignored and an unescaped '#' starts a comment that runs to the
/// end of the line, so patterns that need a literal space or '#' must escape it.
/// </remarks>
public static MatchCollection Match(string regex, string source)
{
    const RegexOptions options =
        RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace;

    // The static overload goes through the framework's regex cache, so the
    // same pattern is not re-parsed on every call the way "new Regex(...)" is.
    return Regex.Matches(source, regex, options);
}

Then change

// Pattern for a CSS color: '#' followed by six or three non-whitespace
// characters (\S, so not limited to hex digits), an rgb(r, g, b) triple whose
// components are 1-3 digits with an optional '%', or one of a fixed list of
// color names.
public const string CssColors = @"(#\S{6}|#\S{3}|rgb\(\s*[0-9]{1,3}\%?\s*\,\s*[0-9]{1,3}\%?\s*\,\s*[0-9]{1,3}\%?\s*\)|maroon|red|orange|yellow|olive|purple|fuchsia|white|lime|green|navy|blue|aqua|teal|black|silver|gray)";

to

/// <summary>
/// Pattern for a CSS color: '#' followed by six or three non-whitespace
/// characters (\S, so not limited to hex digits), an rgb(r, g, b) triple whose
/// components are 1-3 digits with an optional '%', or one of a fixed list of
/// color names.
/// </summary>
/// <remarks>
/// Written for RegexOptions.IgnorePatternWhitespace: the spaces and line
/// breaks inside the literal are layout only, and '#' is escaped as '\#'
/// because that option treats an unescaped '#' as the start of a comment.
/// Without that option the layout whitespace would have to match literally.
/// </remarks>
public const string CssColors =
    @"
        ( \#\S{6}
            | \#\S{3}
            | rgb\(\s*[0-9]{1,3}\%?\s*\,\s*[0-9]{1,3}\%?\s*\,\s*[0-9]{1,3}\%?\s*\)
            | maroon | red | orange | yellow | olive | purple | fuchsia | white 
            | lime | green | navy | blue | aqua | teal | black | silver | gray
        )";

(This must be changed because RegexOptions.IgnorePatternWhitespace causes an unescaped '#' to start a comment.)

Finally, change

// Pattern for one attribute inside a tag: a run of non-whitespace characters,
// optional whitespace, a mandatory '=', optional whitespace, then either a
// double-quoted value or a run of non-whitespace characters. Attributes
// without '=' are not matched, and single quotes are not treated as quotes.
public const string HmlTagAttributes = "[^\\s]*\\s*=\\s*(\"[^\"]*\"|[^\\s]*)";

to

/// <summary>
/// Extracts attributes from an HTML tag; e.g. att, att=value, att="value",
/// att='value'.
/// </summary>
/// <remarks>
/// A match will always have a "name" group and may have an optional
/// "value" or "valueq" group. If it has a "valueq" group then it will
/// also have a "q" group saying what kind of quote was used to enclose
/// the "valueq".
/// The pattern is laid out with whitespace and '#' comments, so it must be
/// used with RegexOptions.IgnorePatternWhitespace.
/// </remarks>
public const string HmlTagAttributes =
    @"(?<name>[^\s=]+)                 # attribute name
        (?:                            # optional value block
            \s* = \s*                  # separated by '='
            (?:                        # quoted or unquoted
                (?<q>['""])            # single or double quote
                (?<valueq>             # quoted value
                    (?:.*?) 
                )
                \k<q>                  # match single or double quote
            | 
            (?<value>[^\s]+))
        )?";
//// Could add:
////     (?<!\\) (?>\\\\)*    # can't end in odd slash count
//// to the end of valueq to allow quotes to be escaped with a
//// backslash. This wouldn't match the HTML spec but is allowed in some
//// places. The resulting value in valueq would have to be post-processed
//// to remove the escape characters.

You will notice that these changes not only fix the issue with '=' in a quoted string; they also add support for both single- and double-quoted values and decode HTML character references (&xxx;) in attribute values - all as per the specification.

Developer
Nov 13, 2012 at 2:50 PM

fixed in 1.1.0.0

thx