The html data tag table uses five quad words of 64 bits per table entry. The first quad word spacifies the length of the html tag in characters. The second and third quad words specify the mask to be used to clear unwanted bits from the two registers used to compare the tag with the table. Finally, the fourth and fifth quad words specify the actual tag in string format (notice the strings are reversed due to being held in little endian format). Using this approach a nice tight tidy loop can be used to get the tag identifer (the row in the table).
;-----------------------------------------------------------------------
; html_tags - HTML tag lookup table (MASM / ml64 syntax).
;
; Each entry is five quad words (40 bytes):
;   +0   length of the tag name in characters
;   +8   mask for the first  8 characters of the candidate tag
;   +16  mask for characters 9..16 of the candidate tag
;   +24  tag name, characters 1..8  (written reversed: a string in a dq
;        is stored little endian, so "rb" lands in memory as 'b','r')
;   +32  tag name, characters 9..16 (or 0)
;
; The row number of an entry is the tag identifier.
; The table ends with a single zero quad word: the lookup loop reads
; each entry's length as a full quad word, so the end marker has to be
; a quad word too (a lone `db 0` would leave seven bytes of whatever
; follows the table inside that read).
;-----------------------------------------------------------------------
html_tags   dq  1, 000000000000000FFH, 00000000000000000H, "a", 0             ;   0  a
            dq  1, 000000000000000FFH, 00000000000000000H, "b", 0             ;   1  b
            dq  1, 000000000000000FFH, 00000000000000000H, "i", 0             ;   2  i
            dq  1, 000000000000000FFH, 00000000000000000H, "p", 0             ;   3  p
            dq  1, 000000000000000FFH, 00000000000000000H, "q", 0             ;   4  q
            dq  1, 000000000000000FFH, 00000000000000000H, "s", 0             ;   5  s
            dq  1, 000000000000000FFH, 00000000000000000H, "u", 0             ;   6  u
            dq  2, 0000000000000FFFFH, 00000000000000000H, "rb", 0            ;   7  br
            dq  2, 0000000000000FFFFH, 00000000000000000H, "dd", 0            ;   8  dd
            dq  2, 0000000000000FFFFH, 00000000000000000H, "ld", 0            ;   9  dl
            dq  2, 0000000000000FFFFH, 00000000000000000H, "td", 0            ;  10  dt
            dq  2, 0000000000000FFFFH, 00000000000000000H, "me", 0            ;  11  em
            dq  2, 0000000000000FFFFH, 00000000000000000H, "1h", 0            ;  12  h1
            dq  2, 0000000000000FFFFH, 00000000000000000H, "2h", 0            ;  13  h2
            dq  2, 0000000000000FFFFH, 00000000000000000H, "3h", 0            ;  14  h3
            dq  2, 0000000000000FFFFH, 00000000000000000H, "4h", 0            ;  15  h4
            dq  2, 0000000000000FFFFH, 00000000000000000H, "5h", 0            ;  16  h5
            dq  2, 0000000000000FFFFH, 00000000000000000H, "6h", 0            ;  17  h6
            dq  2, 0000000000000FFFFH, 00000000000000000H, "rh", 0            ;  18  hr
            dq  2, 0000000000000FFFFH, 00000000000000000H, "il", 0            ;  19  li
            dq  2, 0000000000000FFFFH, 00000000000000000H, "lo", 0            ;  20  ol
            dq  2, 0000000000000FFFFH, 00000000000000000H, "br", 0            ;  21  rb
            dq  2, 0000000000000FFFFH, 00000000000000000H, "pr", 0            ;  22  rp
            dq  2, 0000000000000FFFFH, 00000000000000000H, "tr", 0            ;  23  rt
            dq  2, 0000000000000FFFFH, 00000000000000000H, "dt", 0            ;  24  td
            dq  2, 0000000000000FFFFH, 00000000000000000H, "ht", 0            ;  25  th
            dq  2, 0000000000000FFFFH, 00000000000000000H, "rt", 0            ;  26  tr
            dq  2, 0000000000000FFFFH, 00000000000000000H, "lu", 0            ;  27  ul
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "idb", 0           ;  28  bdi
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "odb", 0           ;  29  bdo
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "loc", 0           ;  30  col
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "led", 0           ;  31  del
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "nfd", 0           ;  32  dfn
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "vid", 0           ;  33  div
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "gmi", 0           ;  34  img
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "sni", 0           ;  35  ins
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "dbk", 0           ;  36  kbd
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "pam", 0           ;  37  map
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "van", 0           ;  38  nav
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "erp", 0           ;  39  pre
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "ctr", 0           ;  40  rtc
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "bus", 0           ;  41  sub
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "pus", 0           ;  42  sup
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "rav", 0           ;  43  var
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "rbw", 0           ;  44  wbr
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "gvs", 0           ;  45  svg
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "rbba", 0          ;  46  abbr
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "aera", 0          ;  47  area
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "esab", 0          ;  48  base
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "ydob", 0          ;  49  body
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "etic", 0          ;  50  cite
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "edoc", 0          ;  51  code
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "atad", 0          ;  52  data
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "mrof", 0          ;  53  form
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "daeh", 0          ;  54  head
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "lmth", 0          ;  55  html
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "knil", 0          ;  56  link
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "niam", 0          ;  57  main
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "kram", 0          ;  58  mark
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "unem", 0          ;  59  menu
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "atem", 0          ;  60  meta
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "ybur", 0          ;  61  ruby
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "pmas", 0          ;  62  samp
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "naps", 0          ;  63  span
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "emit", 0          ;  64  time
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "htap", 0          ;  65  path
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "edisa", 0         ;  66  aside
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "oidua", 0         ;  67  audio
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "debme", 0         ;  68  embed
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "tupni", 0         ;  69  input
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "lebal", 0         ;  70  label
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "retem", 0         ;  71  meter
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "marap", 0         ;  72  param
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "llams", 0         ;  73  small
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "elyts", 0         ;  74  style
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "elbat", 0         ;  75  table
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "ydobt", 0         ;  76  tbody
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "tooft", 0         ;  77  tfoot
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "daeht", 0         ;  78  thead
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "eltit", 0         ;  79  title
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "kcart", 0         ;  80  track
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "oediv", 0         ;  81  video
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "retnec", 0        ;  82  center
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "nottub", 0        ;  83  button
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "savnac", 0        ;  84  canvas
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "golaid", 0        ;  85  dialog
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "erugif", 0        ;  86  figure
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "retoof", 0        ;  87  footer
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "redaeh", 0        ;  88  header
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "puorgh", 0        ;  89  hgroup
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "emarfi", 0        ;  90  iframe
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "negyek", 0        ;  91  keygen
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "dnegel", 0        ;  92  legend
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "tcejbo", 0        ;  93  object
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "noitpo", 0        ;  94  option
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "tuptuo", 0        ;  95  output
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "tpircs", 0        ;  96  script
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "tceles", 0        ;  97  select
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "ecruos", 0        ;  98  source
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "gnorts", 0        ;  99  strong
            dq  7, 000FFFFFFFFFFFFFFH, 00000000000000000H, "sserdda", 0       ; 100  address
            dq  7, 000FFFFFFFFFFFFFFH, 00000000000000000H, "elcitra", 0       ; 101  article
            dq  7, 000FFFFFFFFFFFFFFH, 00000000000000000H, "noitpac", 0       ; 102  caption
            dq  7, 000FFFFFFFFFFFFFFH, 00000000000000000H, "sliated", 0       ; 103  details
            dq  7, 000FFFFFFFFFFFFFFH, 00000000000000000H, "noitces", 0       ; 104  section
            dq  7, 000FFFFFFFFFFFFFFH, 00000000000000000H, "yrammus", 0       ; 105  summary
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "epytcod!", 0      ; 106  !doctype
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "puorgloc", 0      ; 107  colgroup
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "tsilatad", 0      ; 108  datalist
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "tesdleif", 0      ; 109  fieldset
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "metiunem", 0      ; 110  menuitem
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "tpircson", 0      ; 111  noscript
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "puorgtpo", 0      ; 112  optgroup
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "ssergorp", 0      ; 113  progress
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "etalpmet", 0      ; 114  template
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "aeratxet", 0      ; 115  textarea
            dq 10, 0FFFFFFFFFFFFFFFFH, 0000000000000FFFFH, "ouqkcolb", "et"   ; 116  blockquote
            dq 10, 0FFFFFFFFFFFFFFFFH, 0000000000000FFFFH, "itpacgif", "no"   ; 117  figcaption
            dq  0                                                             ; end of table (length 0)
Refinements to the pseudo code described in Part II of this tutorial
As with all things, even pseudo code can be optimised when it actually comes to the coding aspect. In writing the code, I attempted to follow the pseudo code as written, but realised that there were a few things missing. I will not bore you with the details, but simply present the final parse code to you.
;=======================================================================
; Parse_Html - HTML parser module.
; Assembler: MASM (ml64), Intel syntax, x64.
;=======================================================================
            title   Parse_Html
            option  casemap:none

; One parsed element. All fields are quad words.
node        struct
next_node               dq ?        ; next sibling in the chain
previous_node           dq ?        ; previous sibling in the chain
parent_node             dq ?        ; not written by Parse_Html
first_child             dq ?        ; head of the child chain
last_child              dq ?        ; tail of the child chain
html_tag                dq ?        ; row number of the tag in html_tags
html_attribute_string   dq ?        ; address of the copied attribute text
html_text_string        dq ?        ; address of the copied text / script / style
node        ends

; Layout of one html_tags entry (five quad words).
TAG_ENTRY_SIZE  equ 40
TAG_MASK_LO     equ 8               ; mask for characters 1..8
TAG_MASK_HI     equ 16              ; mask for characters 9..16
TAG_TEXT_LO     equ 24              ; tag characters 1..8
TAG_TEXT_HI     equ 32              ; tag characters 9..16

; Byte that marks the end of the source buffer.
END_OF_SOURCE   equ 21

; Row numbers in html_tags that the parser treats specially.
TAG_BR          equ 7
TAG_HR          equ 18
TAG_COL         equ 30
TAG_IMG         equ 34
TAG_WBR         equ 44
TAG_AREA        equ 47
TAG_BASE        equ 48
TAG_LINK        equ 56
TAG_META        equ 60
TAG_EMBED       equ 68
TAG_INPUT       equ 69
TAG_PARAM       equ 72
TAG_STYLE       equ 74
TAG_TRACK       equ 80
TAG_KEYGEN      equ 91
TAG_SCRIPT      equ 96
TAG_SOURCE      equ 98
TAG_DOCTYPE     equ 106

            .data

;-----------------------------------------------------------------------
; html_tags: length, mask lo, mask hi, tag lo, tag hi per entry.
; Tag strings are written reversed because a string in a dq is stored
; little endian. The row number (trailing comment) is the tag id.
; The table is closed by a zero quad word so that the lookup, which
; reads the length as a quad word, stops at tag_not_found.
;-----------------------------------------------------------------------
html_tags   dq  1, 000000000000000FFH, 00000000000000000H, "a", 0             ;   0  a
            dq  1, 000000000000000FFH, 00000000000000000H, "b", 0             ;   1  b
            dq  1, 000000000000000FFH, 00000000000000000H, "i", 0             ;   2  i
            dq  1, 000000000000000FFH, 00000000000000000H, "p", 0             ;   3  p
            dq  1, 000000000000000FFH, 00000000000000000H, "q", 0             ;   4  q
            dq  1, 000000000000000FFH, 00000000000000000H, "s", 0             ;   5  s
            dq  1, 000000000000000FFH, 00000000000000000H, "u", 0             ;   6  u
            dq  2, 0000000000000FFFFH, 00000000000000000H, "rb", 0            ;   7  br
            dq  2, 0000000000000FFFFH, 00000000000000000H, "dd", 0            ;   8  dd
            dq  2, 0000000000000FFFFH, 00000000000000000H, "ld", 0            ;   9  dl
            dq  2, 0000000000000FFFFH, 00000000000000000H, "td", 0            ;  10  dt
            dq  2, 0000000000000FFFFH, 00000000000000000H, "me", 0            ;  11  em
            dq  2, 0000000000000FFFFH, 00000000000000000H, "1h", 0            ;  12  h1
            dq  2, 0000000000000FFFFH, 00000000000000000H, "2h", 0            ;  13  h2
            dq  2, 0000000000000FFFFH, 00000000000000000H, "3h", 0            ;  14  h3
            dq  2, 0000000000000FFFFH, 00000000000000000H, "4h", 0            ;  15  h4
            dq  2, 0000000000000FFFFH, 00000000000000000H, "5h", 0            ;  16  h5
            dq  2, 0000000000000FFFFH, 00000000000000000H, "6h", 0            ;  17  h6
            dq  2, 0000000000000FFFFH, 00000000000000000H, "rh", 0            ;  18  hr
            dq  2, 0000000000000FFFFH, 00000000000000000H, "il", 0            ;  19  li
            dq  2, 0000000000000FFFFH, 00000000000000000H, "lo", 0            ;  20  ol
            dq  2, 0000000000000FFFFH, 00000000000000000H, "br", 0            ;  21  rb
            dq  2, 0000000000000FFFFH, 00000000000000000H, "pr", 0            ;  22  rp
            dq  2, 0000000000000FFFFH, 00000000000000000H, "tr", 0            ;  23  rt
            dq  2, 0000000000000FFFFH, 00000000000000000H, "dt", 0            ;  24  td
            dq  2, 0000000000000FFFFH, 00000000000000000H, "ht", 0            ;  25  th
            dq  2, 0000000000000FFFFH, 00000000000000000H, "rt", 0            ;  26  tr
            dq  2, 0000000000000FFFFH, 00000000000000000H, "lu", 0            ;  27  ul
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "idb", 0           ;  28  bdi
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "odb", 0           ;  29  bdo
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "loc", 0           ;  30  col
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "led", 0           ;  31  del
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "nfd", 0           ;  32  dfn
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "vid", 0           ;  33  div
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "gmi", 0           ;  34  img
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "sni", 0           ;  35  ins
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "dbk", 0           ;  36  kbd
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "pam", 0           ;  37  map
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "van", 0           ;  38  nav
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "erp", 0           ;  39  pre
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "ctr", 0           ;  40  rtc
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "bus", 0           ;  41  sub
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "pus", 0           ;  42  sup
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "rav", 0           ;  43  var
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "rbw", 0           ;  44  wbr
            dq  3, 00000000000FFFFFFH, 00000000000000000H, "gvs", 0           ;  45  svg
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "rbba", 0          ;  46  abbr
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "aera", 0          ;  47  area
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "esab", 0          ;  48  base
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "ydob", 0          ;  49  body
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "etic", 0          ;  50  cite
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "edoc", 0          ;  51  code
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "atad", 0          ;  52  data
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "mrof", 0          ;  53  form
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "daeh", 0          ;  54  head
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "lmth", 0          ;  55  html
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "knil", 0          ;  56  link
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "niam", 0          ;  57  main
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "kram", 0          ;  58  mark
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "unem", 0          ;  59  menu
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "atem", 0          ;  60  meta
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "ybur", 0          ;  61  ruby
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "pmas", 0          ;  62  samp
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "naps", 0          ;  63  span
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "emit", 0          ;  64  time
            dq  4, 000000000FFFFFFFFH, 00000000000000000H, "htap", 0          ;  65  path
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "edisa", 0         ;  66  aside
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "oidua", 0         ;  67  audio
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "debme", 0         ;  68  embed
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "tupni", 0         ;  69  input
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "lebal", 0         ;  70  label
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "retem", 0         ;  71  meter
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "marap", 0         ;  72  param
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "llams", 0         ;  73  small
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "elyts", 0         ;  74  style
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "elbat", 0         ;  75  table
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "ydobt", 0         ;  76  tbody
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "tooft", 0         ;  77  tfoot
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "daeht", 0         ;  78  thead
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "eltit", 0         ;  79  title
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "kcart", 0         ;  80  track
            dq  5, 0000000FFFFFFFFFFH, 00000000000000000H, "oediv", 0         ;  81  video
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "retnec", 0        ;  82  center
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "nottub", 0        ;  83  button
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "savnac", 0        ;  84  canvas
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "golaid", 0        ;  85  dialog
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "erugif", 0        ;  86  figure
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "retoof", 0        ;  87  footer
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "redaeh", 0        ;  88  header
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "puorgh", 0        ;  89  hgroup
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "emarfi", 0        ;  90  iframe
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "negyek", 0        ;  91  keygen
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "dnegel", 0        ;  92  legend
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "tcejbo", 0        ;  93  object
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "noitpo", 0        ;  94  option
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "tuptuo", 0        ;  95  output
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "tpircs", 0        ;  96  script
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "tceles", 0        ;  97  select
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "ecruos", 0        ;  98  source
            dq  6, 00000FFFFFFFFFFFFH, 00000000000000000H, "gnorts", 0        ;  99  strong
            dq  7, 000FFFFFFFFFFFFFFH, 00000000000000000H, "sserdda", 0       ; 100  address
            dq  7, 000FFFFFFFFFFFFFFH, 00000000000000000H, "elcitra", 0       ; 101  article
            dq  7, 000FFFFFFFFFFFFFFH, 00000000000000000H, "noitpac", 0       ; 102  caption
            dq  7, 000FFFFFFFFFFFFFFH, 00000000000000000H, "sliated", 0       ; 103  details
            dq  7, 000FFFFFFFFFFFFFFH, 00000000000000000H, "noitces", 0       ; 104  section
            dq  7, 000FFFFFFFFFFFFFFH, 00000000000000000H, "yrammus", 0       ; 105  summary
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "epytcod!", 0      ; 106  !doctype
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "puorgloc", 0      ; 107  colgroup
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "tsilatad", 0      ; 108  datalist
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "tesdleif", 0      ; 109  fieldset
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "metiunem", 0      ; 110  menuitem
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "tpircson", 0      ; 111  noscript
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "puorgtpo", 0      ; 112  optgroup
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "ssergorp", 0      ; 113  progress
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "etalpmet", 0      ; 114  template
            dq  8, 0FFFFFFFFFFFFFFFFH, 00000000000000000H, "aeratxet", 0      ; 115  textarea
            dq 10, 0FFFFFFFFFFFFFFFFH, 0000000000000FFFFH, "ouqkcolb", "et"   ; 116  blockquote
            dq 10, 0FFFFFFFFFFFFFFFFH, 0000000000000FFFFH, "itpacgif", "no"   ; 117  figcaption
            dq  0                                                             ; end of table (length 0)

; Marker strings, written reversed like the table entries.
start_comment   dd '--!<'               ; "<!--"
end_comment     dd '>--'                ; "-->"
end_script_tag  dq 'tpircs/<'           ; "</script"
end_style_tag   dq '>elyts/<'           ; "</style>"

            .code

;-----------------------------------------------------------------------
; Parse_Html
;   Scans the source for tags and builds a chain of node records,
;   calling itself for the content of every element that is not self
;   terminating.
;
; In:    rcx = address of the qword that holds the first node of the chain
;        rdx = address of the qword that holds the last node of the chain
;        rsi = current position in the source; the source must end with
;              the byte END_OF_SOURCE
;        r14 = next free byte of the workspace. Nodes, tag names,
;              attribute strings and text are all carved from it.
;              Assumed zero filled: the tag length byte is counted up
;              with inc and the table compare relies on zero padding.
; Out:   rax = table row of the end tag that stopped this level
;              (not meaningful when END_OF_SOURCE stops the parse)
;        rsi, r14 advanced past what was consumed / stored
; Clobb: rax, rbx, rcx, rdx, rdi, r8, r9, r10, r12, r13, flags
; Stack: both parameters are stored to their stack home slots on entry,
;        so the caller must reserve 16 bytes of stack immediately before
;        the call, as the recursive call site below does.
; Notes: an unknown tag, or an end tag that does not match the open
;        element, stops at int 3.
;        Only hunt_for_tag tests for END_OF_SOURCE; the comment, quote,
;        attribute, text, script and style scans assume well formed
;        input.
;-----------------------------------------------------------------------
Parse_Html  proc    first_node: qword, last_node: qword

            mov     first_node, rcx
            mov     last_node, rdx

create_node:
            mov     rdi, r14                        ; rdi = new node
            add     r14, sizeof node

hunt_for_tag:
            mov     eax, dword ptr [rsi]
            cmp     al, END_OF_SOURCE
            je      end_of_parse
            cmp     al, '<'
            je      found_start_tag
            add     rsi, 1
            jmp     hunt_for_tag

found_start_tag:
            cmp     eax, start_comment              ; "<!--" ?
            jne     not_start_comment
;
; Process the comment: skip forward to just past "-->"
;
hunt_for_end_comment:
            add     rsi, 1
            mov     eax, dword ptr [rsi]
            and     eax, 0FFFFFFH                   ; keep three characters
            cmp     eax, end_comment
            jne     hunt_for_end_comment
            add     rsi, 3
            jmp     hunt_for_tag

not_start_comment:
            cmp     ah, '/'                         ; "</" starts an end tag
            je      start_end_tag
            xor     r10, r10                        ; r10 = 0: start tag

;
; Copy the tag name into the workspace as <length byte><characters>
;
process_tag:
            mov     r13, r14                        ; r13 -> length byte
            add     rsi, 1
            mov     al, byte ptr [rsi]              ; first character of the name
            cmp     al, 'A'
            jb      copy_html_tag
            cmp     al, 'Z'
            ja      copy_html_tag
            or      al, 020H                        ; fold case like the remaining characters
copy_html_tag:
            add     rsi, 1
            add     r14, 1
            mov     byte ptr [r14], al
            inc     byte ptr [r13]
            mov     al, byte ptr [rsi]
            cmp     al, '0'
            jb      tag_no_number
            cmp     al, '9'
            jbe     copy_html_tag                   ; digits are part of the name (h1..h6)
tag_no_number:
            or      al, 020H                        ; fold upper case to lower case
            cmp     al, 'a'
            jb      end_start_html_tag
            cmp     al, 'z'
            ja      end_start_html_tag
            jmp     copy_html_tag
end_start_html_tag:
            add     r14, 1                          ; r14 -> first free byte after the name
;
; Find the tag in the table: first skip to the entries of equal length
;
            lea     r12, html_tags
            xor     ecx, ecx                        ; rcx = table row
            mov     bl, byte ptr [r13]              ; bl = length of the name
find_html_tag_length_loop:
            mov     rdx, qword ptr [r12]
            test    rdx, rdx
            jz      tag_not_found                   ; reached the zero terminator
            cmp     bl, dl
            je      identify_html_tag
            add     r12, TAG_ENTRY_SIZE
            add     rcx, 1
            jmp     find_html_tag_length_loop

identify_html_tag:
            mov     r8, qword ptr [r13 + 1]         ; characters 1..8
            mov     r9, qword ptr [r13 + 9]         ; characters 9..16
            and     r8, qword ptr [r12 + TAG_MASK_LO]
            and     r9, qword ptr [r12 + TAG_MASK_HI]
            cmp     r8, qword ptr [r12 + TAG_TEXT_LO]
            jne     next_tag_in_table
            cmp     r9, qword ptr [r12 + TAG_TEXT_HI]
            jne     next_tag_in_table
            mov     node.html_tag[rdi], rcx
            jmp     tag_found
next_tag_in_table:
            add     r12, TAG_ENTRY_SIZE
            add     rcx, 1
            mov     rdx, qword ptr [r12]
            test    rdx, rdx
            jz      tag_not_found
            jmp     identify_html_tag
;
; Find the start of the attributes or end of tag
;
tag_found:
            mov     al, byte ptr [rsi]
            cmp     al, ' '
            je      find_non_space
            cmp     al, 009H
            je      find_non_space
            cmp     al, 00DH
            je      find_non_space
            cmp     al, 00AH
            jne     found_non_space
find_non_space:
            add     rsi, 1
            jmp     tag_found

found_non_space:
            cmp     al, '>'
            je      found_end_attributes
            cmp     al, '/'
            je      nearly_found_end_attributes
;
; We have found the start of the attributes
;
            mov     node.html_attribute_string[rdi], r14
find_end_of_attributes:
            mov     byte ptr [r14], al
            add     rsi, 1
            add     r14, 1
            mov     al, byte ptr [rsi]
            cmp     al, '"'
            je      skip_to_end_double_quotes
            cmp     al, "'"
            je      skip_to_end_single_quotes
            cmp     al, '>'
            je      found_end_attributes
            cmp     al, '/'
            je      nearly_found_end_attributes
            jmp     find_end_of_attributes

; Inside "...": copy everything up to the closing double quote.
skip_to_end_double_quotes:
            mov     byte ptr [r14], al
            add     rsi, 1
            add     r14, 1
            mov     al, byte ptr [rsi]
            cmp     al, '"'
            je      find_end_of_attributes
            jmp     skip_to_end_double_quotes

; Inside '...': copy everything up to the closing single quote.
skip_to_end_single_quotes:
            mov     byte ptr [r14], al
            add     rsi, 1
            add     r14, 1
            mov     al, byte ptr [rsi]
            cmp     al, "'"                         ; was '"': a single-quoted value must end at a single quote
            je      find_end_of_attributes
            jmp     skip_to_end_single_quotes

; Seen '/': skip to the closing '>'.
nearly_found_end_attributes:
            add     rsi, 1
            mov     al, byte ptr [rsi]
            cmp     al, '>'
            jne     nearly_found_end_attributes

found_end_attributes:
            xor     al, al
            mov     byte ptr [r14], al              ; zero terminate the attribute string
            add     r14, 1
            add     rsi, 1                          ; step over '>'
            test    r10, r10
            jz      next_sub_section
            mov     rax, node.html_tag[rdi]         ; end tag: hand its row back to the caller
            ret
;
; Need to know if this is a self terminating tag
;
next_sub_section:
            mov     rbx, node.html_tag[rdi]
;
; Special cases for <script> and <style>
;
            cmp     rbx, TAG_SCRIPT
            je      process_script
            cmp     rbx, TAG_STYLE
            je      process_style

            cmp     rbx, TAG_BR
            je      self_terminating
            cmp     rbx, TAG_HR
            je      self_terminating
            cmp     rbx, TAG_COL
            je      self_terminating
            cmp     rbx, TAG_IMG
            je      self_terminating
            cmp     rbx, TAG_WBR
            je      self_terminating
            cmp     rbx, TAG_AREA
            je      self_terminating
            cmp     rbx, TAG_BASE
            je      self_terminating
            cmp     rbx, TAG_LINK
            je      self_terminating
            cmp     rbx, TAG_META
            je      self_terminating
            cmp     rbx, TAG_EMBED
            je      self_terminating
            cmp     rbx, TAG_INPUT
            je      self_terminating
            cmp     rbx, TAG_PARAM
            je      self_terminating
            cmp     rbx, TAG_TRACK
            je      self_terminating
            cmp     rbx, TAG_KEYGEN
            je      self_terminating
            cmp     rbx, TAG_SOURCE
            je      self_terminating
            cmp     rbx, TAG_DOCTYPE
            je      self_terminating
;
; Skip spaces etc
;
find_next_non_space:
            mov     al, byte ptr [rsi]
            cmp     al, ' '
            je      skip_space
            cmp     al, 009H
            je      skip_space
            cmp     al, 00DH
            je      skip_space
            cmp     al, 00AH
            jne     check_for_text
skip_space:
            add     rsi, 1
            jmp     find_next_non_space

check_for_text:
            cmp     al, '<'
            je      recursive_call
;
; Non space characters after the start tag need to be remembered
;
            mov     node.html_text_string[rdi], r14
copy_text_string:
            mov     byte ptr [r14], al
            add     r14, 1
            add     rsi, 1
            mov     al, byte ptr [rsi]
            cmp     al, '<'
            jne     copy_text_string
            add     r14, 1
;
; Apply recursive call to Parse_Html for the children of this node
;
recursive_call:
            add     r14, 1                          ; leave a zero byte after any copied text
            push    rdi
            lea     rdx, node.last_child[rdi]
            lea     rcx, node.first_child[rdi]
            sub     rsp, 16                         ; home slots for the two parameters
            call    Parse_Html
            add     rsp, 16
            pop     rdi
            cmp     node.html_tag[rdi], rax         ; end tag must match the open element
            je      self_terminating
            int     3
;
; Process self terminating tags: append the node to the chain
;
self_terminating:
            mov     rcx, first_node
            mov     rdx, last_node
            mov     rax, qword ptr [rcx]
            test    rax, rax
            jnz     not_first_node
            mov     qword ptr [rcx], rdi            ; chain was empty
            jmp     chain_to_end
not_first_node:
            mov     rax, qword ptr [rdx]
            mov     node.next_node[rax], rdi
chain_to_end:
            mov     rax, qword ptr [rdx]
            mov     node.previous_node[rdi], rax
            mov     qword ptr [rdx], rdi
            jmp     create_node

tag_not_found:
            int     3

start_end_tag:
            add     rsi, 1                          ; step over '<'; process_tag steps over '/'
            mov     r10, 1                          ; r10 = 1: end tag
            jmp     process_tag
;
; Search for </script> end tag, copying the script text
;
process_script:
            mov     node.html_text_string[rdi], r14
process_script_loop:
            mov     rax, qword ptr [rsi]
            cmp     rax, end_script_tag
            je      recursive_call
            mov     byte ptr [r14], al
            add     rsi, 1
            add     r14, 1
            jmp     process_script_loop
;
; Search for </style> end tag, copying the style text
;
process_style:
            mov     node.html_text_string[rdi], r14
process_style_loop:
            mov     rax, qword ptr [rsi]
            cmp     rax, end_style_tag
            je      recursive_call
            mov     byte ptr [r14], al
            add     rsi, 1
            add     r14, 1
            jmp     process_style_loop

end_of_parse:
            ret

Parse_Html  endp

            end
Testing
The 'harness' code you will need is:-
;=======================================================================
; HTMLParse - test harness for Parse_Html.
; Assembler: MASM (ml64), Intel syntax. ABI: Microsoft x64.
;=======================================================================
            title   HTMLParse
            option  casemap:none

FILE_ATTRIBUTE_NORMAL   = 080H
OPEN_EXISTING           = 3
GENERIC_READ            = 080000000H
MEM_COMMIT              = 01000H
PAGE_READWRITE          = 004H          ; was 040H, which is PAGE_EXECUTE_READWRITE
INVALID_HANDLE_VALUE    = -1            ; CreateFileA failure result
INVALID_FILE_SIZE       = 0FFFFFFFFH    ; GetFileSize failure result

END_OF_SOURCE           = 21            ; sentinel byte Parse_Html stops on
UTF8_BOM                = 0BFBBEFH      ; bytes EF BB BF read as a little endian value
STRING_AREA_SIZE        = 10000000      ; bytes of workspace handed to the parser

; Stack frame: 32 bytes of shadow space, three stack argument slots
; (CreateFileA takes seven arguments) and 8 bytes of padding so that
; rsp stays 16-byte aligned at every call.
FRAME_SIZE              = 64
ARG5                    = 32
ARG6                    = 40
ARG7                    = 48

CreateFileA proto   :qword, :qword, :qword, :qword, :qword, :qword, :qword
GetFileSize proto   :qword, :qword
ReadFile    proto   :qword, :qword, :qword, :qword, :qword
CloseHandle proto   :qword
VirtualAlloc proto  :qword, :qword, :qword, :qword
Parse_Html  proto   :qword, :qword

; Only sizeof node is used in this file (to size the node area).
; NOTE(review): the field names and the extra node_text_length field
; differ from the node struct in the Parse_Html listing - confirm which
; layout is intended.
node        struct
next_node               dq ?
previous_node           dq ?
parent_node             dq ?
first_child             dq ?
last_child              dq ?
node_tag                dq ?
node_attribute_string   dq ?
node_text_string        dq ?
node_text_length        dq ?
node        ends

            .data
filename    db  'index.html', 0
bytes_read  dq  0
node_base   dq  0
first_node  dq  0
last_node   dq  0

            .code

;-----------------------------------------------------------------------
; MainStartup - process entry point.
;   Reads index.html into memory, appends the END_OF_SOURCE sentinel,
;   allocates the node and string areas and runs Parse_Html over the
;   text.
;
; Out:   eax = 0 after the parse, 1 if opening, sizing, reading the file
;              or allocating memory failed
; Regs:  r15 = file handle, later the node area
;        r14 = file size, later the string area passed to Parse_Html
;        r13 = source buffer
;        rsi = source position passed to Parse_Html
; Saves: rbx, rsi, rdi, r12-r15 (non-volatile; used here or by Parse_Html)
;-----------------------------------------------------------------------
MainStartup proc
            push    rbx
            push    rsi
            push    rdi
            push    r12
            push    r13
            push    r14
            push    r15
            sub     rsp, FRAME_SIZE                 ; 7 pushes + 64 keeps rsp 16-byte aligned
;
; Read the file into a buffer
;
            xor     eax, eax
            mov     qword ptr [rsp + ARG7], rax                     ; hTemplateFile = NULL
            mov     qword ptr [rsp + ARG6], FILE_ATTRIBUTE_NORMAL   ; dwFlagsAndAttributes
            mov     qword ptr [rsp + ARG5], OPEN_EXISTING           ; dwCreationDisposition
            xor     r9d, r9d                                        ; lpSecurityAttributes = NULL
            xor     r8d, r8d                                        ; dwShareMode = 0
            mov     edx, GENERIC_READ
            lea     rcx, filename
            call    CreateFileA
            cmp     rax, INVALID_HANDLE_VALUE
            je      startup_failed
            mov     r15, rax                        ; r15 = file handle

            xor     edx, edx                        ; lpFileSizeHigh = NULL
            mov     rcx, r15
            call    GetFileSize
            cmp     eax, INVALID_FILE_SIZE
            je      close_and_fail
            mov     r14d, eax                       ; r14 = file size, zero extended

            mov     r9d, PAGE_READWRITE
            mov     r8d, MEM_COMMIT
            lea     rdx, [r14 + 1]                  ; one extra byte for the sentinel
            xor     ecx, ecx
            call    VirtualAlloc
            test    rax, rax
            jz      close_and_fail
            mov     r13, rax                        ; r13 = source buffer
            mov     byte ptr [r13 + r14], END_OF_SOURCE

            xor     eax, eax
            mov     qword ptr [rsp + ARG5], rax     ; lpOverlapped = NULL
            lea     r9, bytes_read
            mov     r8, r14
            mov     rdx, r13
            mov     rcx, r15
            call    ReadFile
            mov     ebx, eax                        ; keep the result across CloseHandle

            mov     rcx, r15
            call    CloseHandle
            test    ebx, ebx
            jz      startup_failed
;
; Compute memory for the number of nodes required to parse
; I have done this by saying that there cannot be more than
; the number of bytes in the html file. For example, a 100
; byte file would allocate 100 nodes - an overkill but it
; will speed things up considerably as we do not have to use
; the heap!
;
            imul    rdx, r14, sizeof node
            inc     rdx
            mov     r9d, PAGE_READWRITE
            mov     r8d, MEM_COMMIT
            xor     ecx, ecx
            call    VirtualAlloc
            test    rax, rax
            jz      startup_failed
            mov     r15, rax                        ; r15 = node area
            mov     rsi, r13                        ; rsi = start of the source
;
; Get memory for the string area
;
            mov     r9d, PAGE_READWRITE
            mov     r8d, MEM_COMMIT
            mov     edx, STRING_AREA_SIZE
            xor     ecx, ecx
            call    VirtualAlloc
            test    rax, rax
            jz      startup_failed
            mov     r14, rax                        ; r14 = string area
;
; Now begin the parse of the HTML. RSI contains the source
; base, R15 the node area and R14 the string area address.
; NOTE(review): the Parse_Html listing in this tutorial takes
; its nodes from R14 and never reads R15 - confirm intent.
;
            mov     eax, dword ptr [rsi]
            and     eax, 0FFFFFFH
            cmp     eax, UTF8_BOM
            jne     parse_file
            add     rsi, 3                          ; skip the byte order mark
parse_file:
            lea     rcx, first_node
            lea     rdx, last_node
            call    Parse_Html                      ; its two home slots lie in our shadow space
            xor     eax, eax
            jmp     startup_exit

close_and_fail:
            mov     rcx, r15
            call    CloseHandle
startup_failed:
            mov     eax, 1
startup_exit:
            add     rsp, FRAME_SIZE
            pop     r15
            pop     r14
            pop     r13
            pop     r12
            pop     rdi
            pop     rsi
            pop     rbx
            ret
MainStartup endp

            end
Remember to set the 'linker->advanced->Entry Point' option to MainStartup and include an 'index.html' file to parse in the source directory.
If you have any questions about this set of tutorials, please leave a reply and I will get back to you as soon as I can.
This post has been edited by Martyn.Rae: 07 July 2017 - 05:52 AM