Page 1 of 1

Parsing HTML in x64 assembly - Part III

#1 Martyn.Rae  Icon User is offline

  • The programming dinosaur
  • member icon

Reputation: 544
  • View blog
  • Posts: 1,413
  • Joined: 22-August 09

Posted 07 July 2017 - 05:51 AM

The HTML Tag Data Table

The HTML tag data table uses five quad words (64 bits each) per table entry. The first quad word specifies the length of the HTML tag in characters. The second and third quad words specify the masks used to clear unwanted bits from the two registers that are compared against the table entry. Finally, the fourth and fifth quad words hold the actual tag in string format (notice the strings are reversed due to being held in little endian format). Using this approach, a nice, tight, tidy loop can be used to get the tag identifier (the row in the table).

;
;   html_tags - tag lookup table, five quad words (40 bytes) per row:
;       +0   tag length in characters
;       +8   mask applied to the first eight bytes of the candidate tag
;       +16  mask applied to the next eight bytes of the candidate tag
;       +24  tag characters 1-8 (written back to front)
;       +32  tag characters 9-16 (written back to front), 0 when unused
;   Rows are grouped by ascending tag length. The table ends with a
;   quad word of zero: the lookup code loads a full quad word length
;   from each row and treats zero as "end of table", so the sentinel
;   must be a dq rather than a single byte.
;
html_tags                           dq        1,  000000000000000FFH, 00000000000000000H, "a",        0
                                    dq        1,  000000000000000FFH, 00000000000000000H, "b",        0
                                    dq        1,  000000000000000FFH, 00000000000000000H, "i",        0
                                    dq        1,  000000000000000FFH, 00000000000000000H, "p",        0
                                    dq        1,  000000000000000FFH, 00000000000000000H, "q",        0
                                    dq        1,  000000000000000FFH, 00000000000000000H, "s",        0
                                    dq        1,  000000000000000FFH, 00000000000000000H, "u",        0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "rb",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "dd",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "ld",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "td",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "me",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "1h",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "2h",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "3h",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "4h",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "5h",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "6h",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "rh",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "il",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "lo",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "br",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "pr",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "tr",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "dt",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "ht",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "rt",       0
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "lu",       0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "idb",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "odb",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "loc",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "led",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "nfd",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "vid",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "gmi",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "sni",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "dbk",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "pam",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "van",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "erp",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "ctr",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "bus",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "pus",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "rav",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "rbw",      0
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "gvs",      0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "rbba",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "aera",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "esab",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "ydob",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "etic",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "edoc",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "atad",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "mrof",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "daeh",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "lmth",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "knil",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "niam",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "kram",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "unem",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "atem",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "ybur",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "pmas",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "naps",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "emit",     0
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "htap",     0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "edisa",    0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "oidua",    0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "debme",    0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "tupni",    0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "lebal",    0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "retem",    0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "marap",    0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "llams",    0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "elyts",    0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "elbat",    0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "ydobt",    0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "tooft",    0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "daeht",    0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "eltit",    0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "kcart",    0
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "oediv",    0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "retnec",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "nottub",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "savnac",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "golaid",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "erugif",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "retoof",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "redaeh",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "puorgh",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "emarfi",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "negyek",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "dnegel",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "tcejbo",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "noitpo",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "tuptuo",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "tpircs",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "tceles",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "ecruos",   0
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "gnorts",   0
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "sserdda",  0
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "elcitra",  0
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "noitpac",  0
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "sliated",  0
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "noitces",  0
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "yrammus",  0
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "epytcod!", 0
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "puorgloc", 0
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "tsilatad", 0
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "tesdleif", 0
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "metiunem", 0
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "tpircson", 0
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "puorgtpo", 0
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "ssergorp", 0
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "etalpmet", 0
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "aeratxet", 0
                                    dq        10, 0FFFFFFFFFFFFFFFFH, 0000000000000FFFFH, "ouqkcolb", "et"
                                    dq        10, 0FFFFFFFFFFFFFFFFH, 0000000000000FFFFH, "itpacgif", "no"
                                    dq        0



Refinements to the pseudo code described in Part II of this tutorial

As with all things, even pseudo code can be optimised when it actually comes to the coding aspect. In writing the code, I attempted to follow the pseudo code as written, but realised that there were a few things missing. I will not bore you with the details, but simply present the final parse code to you.

                                    title     Parse_Html
                                    option    casemap:none

;
;   node - one element of the parse tree. Parse_Html takes each node from
;   the workspace and advances r14 by sizeof node. Every field is one
;   quad word; the link fields are described from their names only.
;
node                                struct
    next_node                       qword     ?          ; presumably the following sibling - confirm
    previous_node                   qword     ?          ; presumably the preceding sibling - confirm
    parent_node                     qword     ?          ; presumably the enclosing node - confirm
    first_child                     qword     ?          ; presumably the first nested node - confirm
    last_child                      qword     ?          ; presumably the last nested node - confirm
    html_tag                        qword     ?          ; row number of the matched tag in html_tags
    html_attribute_string           qword     ?          ; presumably the tag's attribute text - confirm
    html_text_string                qword     ?          ; presumably the node's text content - confirm
node                                ends

                                    .data
;
;   html_tags - tag lookup table, five quad words (40 bytes) per row:
;       +0   tag length in characters
;       +8   mask applied to the first eight bytes of the candidate tag
;       +16  mask applied to the next eight bytes of the candidate tag
;       +24  tag characters 1-8 (written back to front)
;       +32  tag characters 9-16 (written back to front), 0 when unused
;   Rows are grouped by ascending tag length; the trailing comment on
;   each row is its row number, which Parse_Html stores in node.html_tag.
;   The final quad word of zero is the end-of-table sentinel: the lookup
;   loops load the length quad word of each row and branch to
;   tag_not_found when it is zero. Without it an unknown tag would make
;   the scan continue into the data that follows the table.
;
html_tags                           dq        1,  000000000000000FFH, 00000000000000000H, "a",        0          ; 0
                                    dq        1,  000000000000000FFH, 00000000000000000H, "b",        0          ; 1
                                    dq        1,  000000000000000FFH, 00000000000000000H, "i",        0          ; 2
                                    dq        1,  000000000000000FFH, 00000000000000000H, "p",        0          ; 3
                                    dq        1,  000000000000000FFH, 00000000000000000H, "q",        0          ; 4
                                    dq        1,  000000000000000FFH, 00000000000000000H, "s",        0          ; 5
                                    dq        1,  000000000000000FFH, 00000000000000000H, "u",        0          ; 6
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "rb",       0          ; 7
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "dd",       0          ; 8
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "ld",       0          ; 9
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "td",       0          ; 10
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "me",       0          ; 11
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "1h",       0          ; 12
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "2h",       0          ; 13
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "3h",       0          ; 14
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "4h",       0          ; 15
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "5h",       0          ; 16
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "6h",       0          ; 17
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "rh",       0          ; 18
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "il",       0          ; 19
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "lo",       0          ; 20
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "br",       0          ; 21
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "pr",       0          ; 22
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "tr",       0          ; 23
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "dt",       0          ; 24
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "ht",       0          ; 25
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "rt",       0          ; 26
                                    dq        2,  0000000000000FFFFH, 00000000000000000H, "lu",       0          ; 27
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "idb",      0          ; 28
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "odb",      0          ; 29
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "loc",      0          ; 30
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "led",      0          ; 31
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "nfd",      0          ; 32
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "vid",      0          ; 33
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "gmi",      0          ; 34
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "sni",      0          ; 35
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "dbk",      0          ; 36
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "pam",      0          ; 37
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "van",      0          ; 38
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "erp",      0          ; 39
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "ctr",      0          ; 40
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "bus",      0          ; 41
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "pus",      0          ; 42
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "rav",      0          ; 43
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "rbw",      0          ; 44
                                    dq        3,  00000000000FFFFFFH, 00000000000000000H, "gvs",      0          ; 45
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "rbba",     0          ; 46
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "aera",     0          ; 47
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "esab",     0          ; 48
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "ydob",     0          ; 49
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "etic",     0          ; 50
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "edoc",     0          ; 51
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "atad",     0          ; 52
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "mrof",     0          ; 53
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "daeh",     0          ; 54
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "lmth",     0          ; 55
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "knil",     0          ; 56
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "niam",     0          ; 57
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "kram",     0          ; 58
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "unem",     0          ; 59
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "atem",     0          ; 60
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "ybur",     0          ; 61
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "pmas",     0          ; 62
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "naps",     0          ; 63
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "emit",     0          ; 64
                                    dq        4,  000000000FFFFFFFFH, 00000000000000000H, "htap",     0          ; 65
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "edisa",    0          ; 66
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "oidua",    0          ; 67
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "debme",    0          ; 68
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "tupni",    0          ; 69
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "lebal",    0          ; 70
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "retem",    0          ; 71
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "marap",    0          ; 72
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "llams",    0          ; 73
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "elyts",    0          ; 74
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "elbat",    0          ; 75
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "ydobt",    0          ; 76
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "tooft",    0          ; 77
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "daeht",    0          ; 78
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "eltit",    0          ; 79
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "kcart",    0          ; 80
                                    dq        5,  0000000FFFFFFFFFFH, 00000000000000000H, "oediv",    0          ; 81
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "retnec",   0          ; 82
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "nottub",   0          ; 83
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "savnac",   0          ; 84
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "golaid",   0          ; 85
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "erugif",   0          ; 86
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "retoof",   0          ; 87
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "redaeh",   0          ; 88
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "puorgh",   0          ; 89
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "emarfi",   0          ; 90
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "negyek",   0          ; 91
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "dnegel",   0          ; 92
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "tcejbo",   0          ; 93
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "noitpo",   0          ; 94
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "tuptuo",   0          ; 95
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "tpircs",   0          ; 96
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "tceles",   0          ; 97
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "ecruos",   0          ; 98
                                    dq        6,  00000FFFFFFFFFFFFH, 00000000000000000H, "gnorts",   0          ; 99
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "sserdda",  0          ; 100
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "elcitra",  0          ; 101
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "noitpac",  0          ; 102
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "sliated",  0          ; 103
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "noitces",  0          ; 104
                                    dq        7,  000FFFFFFFFFFFFFFH, 00000000000000000H, "yrammus",  0          ; 105
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "epytcod!", 0          ; 106
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "puorgloc", 0          ; 107
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "tsilatad", 0          ; 108
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "tesdleif", 0          ; 109
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "metiunem", 0          ; 110
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "tpircson", 0          ; 111
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "puorgtpo", 0          ; 112
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "ssergorp", 0          ; 113
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "etalpmet", 0          ; 114
                                    dq        8,  0FFFFFFFFFFFFFFFFH, 00000000000000000H, "aeratxet", 0          ; 115
                                    dq        10, 0FFFFFFFFFFFFFFFFH, 0000000000000FFFFH, "ouqkcolb", "et"       ; 116
                                    dq        10, 0FFFFFFFFFFFFFFFFH, 0000000000000FFFFH, "itpacgif", "no"       ; 117
                                    dq        0                                                                  ; end-of-table sentinel (length 0)
                                                                                                                 
;
;   Marker constants, spelled back to front in the same way as the tag
;   strings in html_tags.
;
;   start_comment: "<!--" reversed. Parse_Html compares it with the dword
;   loaded from the source once a '<' has been found.
start_comment                       dd        '--!<'
;   end_comment: "-->" reversed. Parse_Html compares it with the dword at
;   the scan position after masking that dword with 0FFFFFFH (three bytes).
end_comment                         dd        '>--'
;   end_script_tag: "</script" reversed (eight characters, no closing '>').
;   Not referenced in the part of Parse_Html visible here.
end_script_tag                      dq        'tpircs/<'
;   end_style_tag: "</style>" reversed.
;   Not referenced in the part of Parse_Html visible here.
end_style_tag                       dq        '>elyts/<'

                                    .code

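;
;   r15 points to the source
;   NOTE(review): the code below reads the source through rsi, not r15 - confirm which register is intended
;   r14 points to the free workspace
;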

Parse_Html                          proc      first_node: qword, last_node: qword
                                    mov       first_node, rcx
                                    mov       last_node, rdx
create_node:                        mov       rdi, r14
                                    add       r14, sizeof node
hunt_for_tag:                       mov       eax, dword ptr [rsi]
                                    cmp       al, 21
                                    je        end_of_parse
                                    cmp       al, '<'
                                    je        found_start_tag
                                    add       rsi, 1
                                    jmp       hunt_for_tag
found_start_tag:                    cmp       eax, start_comment
                                    jne       not_start_comment
                                    
                                    ;
                                    ; Process the comment
                                    ;

hunt_for_end_comment:               add       rsi, 1
                                    mov       eax, dword ptr [rsi]
                                    and       eax, 0FFFFFFH
                                    cmp       eax, end_comment
                                    jne       hunt_for_end_comment
                                    add       rsi, 3
                                    jmp       hunt_for_tag
not_start_comment:                  cmp       ah, '/'
                                    je        start_end_tag
                                    xor       r10, r10
process_tag:                        mov       r13, r14
                                    add       rsi, 1
                                    mov       al, byte ptr [rsi]
copy_html_tag:                      add       rsi, 1
                                    add       r14, 1
                                    mov       byte ptr [r14], al
                                    inc       byte ptr [r13]
                                    mov       al, byte ptr [rsi]
                                    cmp       al, '0'
                                    jb        tag_no_number
                                    cmp       al, '9'
                                    jbe       copy_html_tag
tag_no_number:                      or        al, 020H
                                    cmp       al, 'a'
                                    jb        end_start_html_tag
                                    cmp       al, 'z'
                                    ja        end_start_html_tag
                                    jmp       copy_html_tag
end_start_html_tag:                 add       r14, 1
                                    ;
                                    ;   Find the tag in the table
                                    ;
                                    lea       r12, html_tags
                                    xor       rdx, rdx
                                    xor       rcx, rcx
                                    mov       bl, byte ptr [r13]
find_html_tag_length_loop:          mov       rdx, qword ptr [r12]
                                    or        rdx, rdx
                                    je        tag_not_found
                                    cmp       bl, dl
                                    je        identify_html_tag
                                    lea       r12, 40[r12]
                                    add       rcx, 1
                                    jmp       find_html_tag_length_loop
identify_html_tag:                  mov       r8, qword ptr 1[r13]
                                    mov       r9, qword ptr 9[r13]
                                    and       r8, qword ptr 8[r12]
                                    and       r9, qword ptr 16[r12]
                                    cmp       r8, qword ptr 24[r12]
                                    jne       next_tag_in_table
                                    cmp       r9, qword ptr 32[r12]
                                    jne       next_tag_in_table
                                    mov       node.html_tag[rdi], rcx
                                    jmp       tag_found
next_tag_in_table:                  lea       r12, 40[r12]
                                    add       rcx, 1
                                    mov       rdx, qword ptr [r12]
                                    or        rdx, rdx
                                    je        tag_not_found
                                    jmp       identify_html_tag

                                    ;
                                    ; Find the start of the attributes or end of tag
                                    ;

tag_found:                          mov       al, byte ptr [rsi]
                                    cmp       al, ' '
                                    je        find_non_space
                                    cmp       al, 009H
                                    je        find_non_space
                                    cmp       al, 00DH
                                    je        find_non_space
                                    cmp       al, 00AH
                                    jne       found_non_space
find_non_space:                     add       rsi, 1
                                    jmp       tag_found
found_non_space:                    cmp       al, '>'
                                    je        found_end_attributes
                                    cmp       al, '/'
                                    je        nearly_found_end_attributes

                                    ;
                                    ; We have found the start of the attributes
                                    ;

                                    mov       node.html_attribute_string[rdi], r14
find_end_of_attributes:             mov       byte ptr [r14], al
                                    add       rsi, 1
                                    add       r14, 1
                                    mov       al, byte ptr [rsi]
                                    cmp       al, '"'
                                    je        skip_to_end_double_quotes
                                    cmp       al, "'"
                                    je        skip_to_end_single_quotes
                                    cmp       al, '>'
                                    je        found_end_attributes
                                    cmp       al, '/'
                                    je        nearly_found_end_attributes
                                    jmp       find_end_of_attributes
skip_to_end_double_quotes:          mov       byte ptr [r14], al
                                    add       rsi, 1
                                    add       r14, 1
                                    mov       al, byte ptr [rsi]
                                    cmp       al, '"'
                                    je        find_end_of_attributes
                                    jmp       skip_to_end_double_quotes
skip_to_end_single_quotes:          mov       byte ptr [r14], al
                                    add       rsi, 1
                                    add       r14, 1
                                    mov       al, byte ptr [rsi]
                                    cmp       al, '"'
                                    je        find_end_of_attributes
                                    jmp       skip_to_end_single_quotes
nearly_found_end_attributes:        add       rsi, 1
                                    mov       al, byte ptr [rsi]
                                    cmp       al, '>'
                                    jne       nearly_found_end_attributes
found_end_attributes:               xor       al, al
                                    mov       byte ptr [r14], al 
                                    add       r14, 1                                   
                                    add       rsi, 1
                                    or        r10, r10
                                    je        next_sub_section
                                    mov       rax, node.html_tag[rdi]
                                    ret

                                    ;
                                    ; Need to know if this is a self terminating tag
                                    ;

next_sub_section:                   mov       rbx, node.html_tag[rdi]

                                    ;
                                    ; Special case for <script>
                                    ;

                                    cmp       rbx, 96
                                    je        process_script

                                    ;
                                    ; Special case for <style>
                                    ;

                                    cmp       rbx, 74
                                    je        process_style

                                    cmp       rbx, 7
                                    je        self_terminating
                                    cmp       rbx, 18
                                    je        self_terminating
                                    cmp       rbx, 30
                                    je        self_terminating
                                    cmp       rbx, 34
                                    je        self_terminating
                                    cmp       rbx, 44
                                    je        self_terminating
                                    cmp       rbx, 47
                                    je        self_terminating
                                    cmp       rbx, 48
                                    je        self_terminating
                                    cmp       rbx, 56
                                    je        self_terminating
                                    cmp       rbx, 60
                                    je        self_terminating
                                    cmp       rbx, 68
                                    je        self_terminating
                                    cmp       rbx, 69
                                    je        self_terminating
                                    cmp       rbx, 72
                                    je        self_terminating
                                    cmp       rbx, 80
                                    je        self_terminating
                                    cmp       rbx, 91
                                    je        self_terminating
                                    cmp       rbx, 98
                                    je        self_terminating
                                    cmp       rbx, 106
                                    je        self_terminating

                                    ;
                                    ; Skip spaces etc
                                    ;

find_next_non_space:                mov       al, byte ptr [rsi]
                                    cmp       al, ' '
                                    je        skip_space
                                    cmp       al, 009H
                                    je        skip_space
                                    cmp       al, 00DH
                                    je        skip_space
                                    cmp       al, 00AH
                                    jne       check_for_text
skip_space:                         add       rsi, 1
                                    jmp       find_next_non_space
check_for_text:                     cmp       al, '<'
                                    je        recursive_call

                                    ;
                                    ; Non space characters after the start tag need to be remembered
                                    ;

                                    mov       node.html_text_string[rdi], r14
copy_text_string:                   mov       byte ptr [r14], al
                                    add       r14, 1
                                    add       rsi, 1
                                    mov       al, byte ptr [rsi]
                                    cmp       al, '<'
                                    jne       copy_text_string
                                    add       r14, 1

                                    ;
                                    ; Apply recursive call to Parse_Html
                                    ;

recursive_call:                     add       r14, 1
                                    push      rdi
                                    lea       rdx, node.last_child[rdi]
                                    lea       rcx, node.first_child[rdi]
                                    sub       rsp, 16
                                    call      Parse_Html
                                    add       rsp, 16
                                    pop       rdi
                                    cmp       node.html_tag[rdi], rax
                                    je        self_terminating
                                    int       3

                                    ;
                                    ; Process self terminating tags
                                    ;

self_terminating:                   mov       rcx, first_node
                                    mov       rdx, last_node
                                    mov       rax, qword ptr [rcx]
                                    or        rax, rax
                                    jne       not_first_node
                                    mov       qword ptr [rcx], rdi
                                    jmp       chain_to_end
not_first_node:                     mov       rax, qword ptr [rdx]
                                    mov       node.next_node[rax], rdi
chain_to_end:                       mov       rax, qword ptr [rdx]
                                    mov       node.previous_node[rdi], rax
                                    mov       qword ptr [rdx], rdi
                                    jmp       create_node                           
tag_not_found:                      int       3
start_end_tag:                      add       rsi, 1
                                    xor       r10, r10
                                    inc       r10
                                    jmp       process_tag

                                    ;
                                    ; Search for </script> end tag
                                    ;

process_script:                     mov       node.html_text_string[rdi], r14
process_script_loop:                mov       rax, qword ptr [rsi]
                                    mov       rbx, end_script_tag
                                    cmp       rax, rbx
                                    je        recursive_call
                                    mov       byte ptr [r14], al
                                    add       rsi, 1
                                    add       r14, 1
                                    jmp       process_script_loop

                                    ;
                                    ; Search for </style> end tag
                                    ;

process_style:                      mov       node.html_text_string[rdi], r14
process_style_loop:                 mov       rax, qword ptr [rsi]
                                    mov       rbx, end_style_tag
                                    cmp       rax, rbx
                                    je        recursive_call
                                    mov       byte ptr [r14], al
                                    add       rsi, 1
                                    add       r14, 1
                                    jmp       process_style_loop
end_of_parse:                       ret

Parse_Html                          endp
                                    end



Testing

The 'harness' code you will need is:-

                                    title     HTMLParse
                                    option    casemap:none

;
; Win32 constants used by the harness. Values are those published in the
; Windows SDK headers.
;
FILE_ATTRIBUTE_NORMAL               =         080H                    ; CreateFileA flags-and-attributes argument
OPEN_EXISTING                       =         3                       ; CreateFileA creation disposition
GENERIC_READ                        =         080000000H              ; CreateFileA desired access
MEM_COMMIT                          =         01000H                  ; VirtualAlloc allocation type
PAGE_READWRITE                      =         004H                    ; VirtualAlloc protection (040H is PAGE_EXECUTE_READWRITE)

;
; External routines used by the harness. MainStartup reaches every one of
; them with a plain `call`, loading the argument registers and stack slots
; by hand, so these parameter lists only declare the symbols; they are not
; used to marshal arguments.
;
CreateFileA                         proto     :qword, :qword, :qword, :qword, :qword, :qword, :qword
GetFileSize                         proto     :qword, :qword
ReadFile                            proto     :qword, :qword, :qword, :qword, :qword
CloseHandle                         proto     :qword
VirtualAlloc                        proto     :qword, :qword, :qword, :qword
; NOTE(review): three parameters are declared here, but MainStartup loads
; only rcx and rdx before calling Parse_Html - confirm against the
; Parse_Html proc header.
Parse_Html                          proto     :qword, :qword, :qword

;
; One parse-tree node: nine quad words, 72 bytes. MainStartup only uses
; `sizeof node` to size the node pool.
;
; NOTE(review): the Parse_Html listing addresses the tag, attribute and
; text fields as node.html_tag, node.html_attribute_string and
; node.html_text_string, whereas this copy of the structure spells them
; node_tag, node_attribute_string and node_text_string. Confirm that both
; source files agree on the field names and on the layout.
;
node                                struct
    next_node                       dq        ?                       ; sibling link, written by Parse_Html when chaining nodes
    previous_node                   dq        ?                       ; sibling link, written by Parse_Html when chaining nodes
    parent_node                     dq        ?                       ; not referenced in the visible code
    first_child                     dq        ?                       ; its address is passed in rcx to the recursive Parse_Html call
    last_child                      dq        ?                       ; its address is passed in rdx to the recursive Parse_Html call
    node_tag                        dq        ?                       ; see NOTE(review) above regarding the field name
    node_attribute_string           dq        ?                       ; see NOTE(review) above regarding the field name
    node_text_string                dq        ?                       ; see NOTE(review) above regarding the field name
    node_text_length                dq        ?                       ; not referenced in the visible code
node                                ends

                                    ;
                                    ; Harness globals
                                    ;
                                    .data
filename                            db        'index.html', 0         ; NUL-terminated path handed to CreateFileA
bytes_read                          dq        0                       ; its address is handed to ReadFile as the byte-count output
node_base                           dq        0                       ; not referenced by MainStartup
first_node                          dq        0                       ; its address is passed to Parse_Html in rcx
last_node                           dq        0                       ; its address is passed to Parse_Html in rdx

                                    .code

;
; MainStartup - process entry point for the test harness.
;
; Reads 'index.html' into memory, allocates the node pool and the string
; area, skips a UTF-8 byte order mark if one is present and then calls
; Parse_Html.
;
; ABI:      Microsoft x64.
; Returns:  rax = 0 on success; rax = 1 if the file could not be opened,
;           sized or read, or if any allocation failed.
; Hand-off to Parse_Html (same registers as the original harness):
;           rcx = address of first_node, rdx = address of last_node,
;           rsi = first byte to parse, r15 = node pool base,
;           r14 = string area base. r13 (file buffer) and rbx
;           (sizeof node) also hold the values the original left in them.
; Stack:    rsp is 8 mod 16 on entry. Seven pushes (56 bytes) bring it to
;           a 16-byte boundary and STARTUP_FRAME keeps it there, so every
;           call below is made with an aligned stack and a full 32-byte
;           home area. [rsp+32]..[rsp+55] carry the stack arguments.
; Notes:    Buffers are not released on the failure paths; the process is
;           about to exit.
;           NOTE(review): Parse_Html is now entered with the standard
;           alignment and a 64-byte area above its return address instead
;           of the original 16 bytes - confirm it does not rely on the
;           old, non-standard stack layout.
;
STARTUP_FRAME                       =         64                      ; 32 home + 24 stack args + 8 padding
STRING_AREA_BYTES                   =         10000000                ; size of the string area handed to Parse_Html
INVALID_HANDLE_VALUE                =         -1                      ; CreateFileA failure result
INVALID_FILE_SIZE                   =         0FFFFFFFFH              ; GetFileSize failure result
UTF8_BOM                            =         0BFBBEFH                ; bytes EF BB BF read as a little endian value

MainStartup                         proc
                                    push      rbx
                                    push      rsi
                                    push      rdi
                                    push      r12
                                    push      r13
                                    push      r14
                                    push      r15
                                    sub       rsp, STARTUP_FRAME
                                    ;
                                    ;    Open the file for reading. Arguments five to seven
                                    ;    go in the stack slots just above the home area.
                                    ;
                                    xor       eax, eax
                                    mov       qword ptr [rsp+48], rax
                                    mov       qword ptr [rsp+40], FILE_ATTRIBUTE_NORMAL
                                    mov       qword ptr [rsp+32], OPEN_EXISTING
                                    xor       r9d, r9d
                                    xor       r8d, r8d
                                    mov       edx, GENERIC_READ
                                    lea       rcx, filename
                                    call      CreateFileA
                                    cmp       rax, INVALID_HANDLE_VALUE
                                    je        startup_failed
                                    mov       r15, rax                ; r15 = file handle
                                    ;
                                    ;    Get the file size. The result is a 32-bit value, so
                                    ;    take it from eax and let the move zero-extend it.
                                    ;
                                    xor       edx, edx
                                    mov       rcx, r15
                                    call      GetFileSize
                                    cmp       eax, INVALID_FILE_SIZE
                                    je        close_then_fail
                                    mov       r14d, eax               ; r14 = file size in bytes
                                    ;
                                    ;    Allocate the file buffer with one spare byte
                                    ;
                                    mov       r9d, PAGE_READWRITE
                                    mov       r8d, MEM_COMMIT
                                    lea       rdx, [r14+1]
                                    xor       ecx, ecx
                                    call      VirtualAlloc
                                    test      rax, rax
                                    jz        close_then_fail
                                    mov       r13, rax                ; r13 = file buffer
                                    ;
                                    ;    The spare byte past the file data is set to 21, as
                                    ;    in the original harness (presumably the end-of-input
                                    ;    marker Parse_Html looks for - confirm).
                                    ;
                                    mov       byte ptr [r13+r14], 21
                                    ;
                                    ;    Read the whole file, then close the handle whether
                                    ;    or not the read worked.
                                    ;
                                    mov       qword ptr [rsp+32], 0
                                    lea       r9, bytes_read
                                    mov       r8, r14
                                    mov       rdx, r13
                                    mov       rcx, r15
                                    call      ReadFile
                                    mov       r12d, eax               ; keep the ReadFile result across CloseHandle
                                    mov       rcx, r15
                                    call      CloseHandle
                                    test      r12d, r12d
                                    jz        startup_failed
                                    ;
                                    ;    Compute memory for the number of nodes required to
                                    ;    parse. There cannot be more nodes than there are
                                    ;    bytes in the html file, so a 100 byte file gets 100
                                    ;    nodes - an overkill, but it speeds things up
                                    ;    considerably as the heap is never needed.
                                    ;
                                    mov       rbx, sizeof node
                                    mov       rax, r14
                                    imul      rax, rbx
                                    lea       rdx, [rax+1]
                                    mov       r9d, PAGE_READWRITE
                                    mov       r8d, MEM_COMMIT
                                    xor       ecx, ecx
                                    call      VirtualAlloc
                                    test      rax, rax
                                    jz        startup_failed
                                    mov       r15, rax                ; r15 = node pool base
                                    mov       rsi, r13                ; rsi = start of the html source
                                    ;
                                    ;    Get memory for the string area
                                    ;
                                    mov       r9d, PAGE_READWRITE
                                    mov       r8d, MEM_COMMIT
                                    mov       edx, STRING_AREA_BYTES
                                    xor       ecx, ecx
                                    call      VirtualAlloc
                                    test      rax, rax
                                    jz        startup_failed
                                    mov       r14, rax                ; r14 = string area base

                                    ;
                                    ;    Now begin the parse of the HTML. RSI contains the source
                                    ;    base and R15 contains the address for the current node
                                    ;    and R14 contains the string area address. A leading
                                    ;    UTF-8 byte order mark is stepped over first.
                                    ;

                                    mov       eax, dword ptr [rsi]
                                    and       eax, 0FFFFFFH
                                    cmp       eax, UTF8_BOM
                                    jne       parse_file
                                    add       rsi, 3
parse_file:                         lea       rcx, first_node
                                    lea       rdx, last_node
                                    call      Parse_Html
                                    xor       eax, eax
startup_exit:                       add       rsp, STARTUP_FRAME
                                    pop       r15
                                    pop       r14
                                    pop       r13
                                    pop       r12
                                    pop       rdi
                                    pop       rsi
                                    pop       rbx
                                    ret

                                    ;
                                    ;    Failure paths: close the file if it is still open
                                    ;    and return 1
                                    ;
close_then_fail:                    mov       rcx, r15
                                    call      CloseHandle
startup_failed:                     mov       eax, 1
                                    jmp       startup_exit
MainStartup                         endp
                                    end



Remember to set the 'linker->advanced->Entry Point' option to MainStartup and include an 'index.html' file to parse in the source directory.

If you have any questions about this set of tutorials, please leave a reply and I will get back to you as soon as I can.

This post has been edited by Martyn.Rae: 07 July 2017 - 05:52 AM


Is This A Good Question/Topic? 0
  • +

Page 1 of 1