Saturday, October 11, 2014

Parsers and tokenizers (Python)

Here I am collecting simple (no third-party libraries) functions for parsing and tokenization of frequent formats of texts.
From Wikipedia:
Tokenization is the process of breaking a stream of text up into words, phrases, symbols, or other meaningful elements called tokens. The list of tokens becomes input for further processing such as parsing or text mining. Tokenization is useful both in linguistics (where it is a form of text segmentation), and in computer science, where it forms part of lexical analysis.

1. Tokenizers for English text

An evaluation function to estimate quality of tokenization:
def estimate_quality(tokenizer, show_mismatch = False):
    #Author: Ruslan Mavlyutov, m-ceros@yandex.ru. Please don't remove this line.
    import re
    text = """Belfort's firm was eventually shut down (1998) and he was indicted for securities fraud and money laundering[21]
    Of the US$11.6 million that has been recovered by Belfort's victims, US$10.4 million of the total...
    ... formed a multi-state task force...
    "Jordan Belfort - The Wolf of Wall Street". YouTube. 2010-07-05. Retrieved 2014_05_14.
    http://en.wikipedia.org/wiki/Jordan_Belfort
    """
    ground_truth = ["Belfort", "'s", 'firm', 'was', 'eventually', 'shut', 'down', '(', '1998', ')',
                   'and', 'he', 'was', 'indicted', 'for', 'securities', 'fraud', 'and', 'money', 'laundering', 
                    '[', '21', ']', '\n', 'Of', 'the', 'US$11.6', 'million', 'that', 'has', 'been', 'recovered', 
                    'by', "Belfort", "'s", 'victims', ',', 'US$10.4', 'million', 'of', 'the', 'total', '...', '\n', 
                    '...', 'formed', 'a', 'multi-state', 'task', 'force', '...', '\n', '"', 'Jordan', 'Belfort', 
                    '-', 'The', 'Wolf', 'of', 'Wall', 'Street', '"', '.', 'YouTube', '.', '2010-07-05', '.', 
                    'Retrieved', '2014_05_14', '.', '\n',  'http://en.wikipedia.org/wiki/Jordan_Belfort', '\n']
    result = tokenizer(text)
    if not result:
        return 0.0
    ground_index = 0
    result_index = 0
    ground_pos = 0
    result_pos = 0
    matched = 0
    mismatches = []
    while ground_index < len(ground_truth) and result_index < len(result):
        result_pos = text.find(result[result_index], result_pos)
        ground_pos = text.find(ground_truth[ground_index], ground_pos)
        if result_pos == ground_pos and result[result_index] == ground_truth[ground_index]:
            matched += 1
            ground_pos += len(ground_truth[ground_index]) # do not allow overlap with previous match
            result_pos += len(result[result_index])
            result_index += 1
            ground_index += 1
            continue
        result_end_pos = result_pos + len(result[result_index])
        ground_end_pos = ground_pos + len(ground_truth[ground_index])
        if result_end_pos == ground_end_pos:
            mismatches += ["gr:" + ground_truth[ground_index], "res:" + result[result_index]]
            ground_pos += len(ground_truth[ground_index])
            result_pos += len(result[result_index])
            result_index += 1
            ground_index += 1
            continue
        elif result_end_pos > ground_end_pos:
            mismatches += ["gr:" + ground_truth[ground_index]]
            ground_pos += len(ground_truth[ground_index])
            ground_index += 1
        else:
            mismatches += ["res:" + result[result_index]]
            result_pos += len(result[result_index])
            result_index += 1
    mismatches += ["gr:" + item for item in ground_truth[ground_index:]] + ["res:" + item for item in result[result_index:]]   
    if show_mismatch and mismatches:
        print mismatches
    precision = float(matched) / len(result)
    recall = float(matched) / len(ground_truth)
    fmes = 2 * precision * recall / (precision + recall)
    return fmes
I want punctuation and possessives to be separated from words; however, I also want to keep words like “2010-07-05”, “US$10.4” and “multi-state” as one word. New line signs should not be dropped, and html links should stay as one word.
The very simple tokenization which is suitable for matching with Google Ngrams, splitting web queries and in all cases where you are not going to do morphology and syntax on top of it:
def tokenization_simple(text):
    #Author: Ruslan Mavlyutov, m-ceros@yandex.ru. Please don't remove this line.
    import re
    words = [word for word in re.findall("[0-9a-zA-Z_]+|[^\s0-9a-zA-Z]", text)]
    return words
Yields 0.66 F-measure:
print estimate_quality(tokenization_simple, True)
> ["res:'", "gr:'s", 'res:s', 'gr:\n', 'res:US', 'res:$', 'res:11', 'res:.', 'gr:US$11.6', 
'res:6', "res:'", "gr:'s", 'res:s', 'res:US', 'res:$', 'res:10', 'res:.', 'gr:US$10.4', 
'res:4', 'res:.', 'res:.', 'gr:...', 'res:.', 'gr:\n', 'res:.', 'res:.', 'gr:...', 'res:.', 
'res:multi', 'res:-', 'gr:multi-state', 'res:state', 'res:.', 'res:.', 'gr:...', 'res:.', 
'gr:\n', 'res:2010', 'res:-', 'res:07', 'res:-', 'gr:2010-07-05', 'res:05', 'gr:\n', 
'res:http', 'res::', 'res:/', 'res:/', 'res:en', 'res:.', 'res:wikipedia', 'res:.', 
'res:org', 'res:/', 'res:wiki', 'res:/', 'gr:http://en.wikipedia.org/wiki/Jordan_Belfort', 
'res:Jordan_Belfort', 'gr:\n']
> 0.662857142857
A small improvement for web links (might be useful):
def tokenization_simple_with_links(text):
    #Author: Ruslan Mavlyutov, m-ceros@yandex.ru. Please don't remove this line.
    import re
    words = [word for word in \
     re.findall("ftps*:\/\/[^\s]+|https*:\/\/[^\s]+|www\.[^\s]+|[0-9a-zA-Z]+|[^\s0-9a-zA-Z]", text)]
    return words
Yields 0.695 F-measure. Same mistakes, except that we managed to join parts of the http-link in the end of the text:
print estimate_quality(tokenization_simple_with_links, True)
> ["res:'", "gr:'s", 'res:s', 'gr:\n', 'res:US', 'res:$', 'res:11', 'res:.', 'gr:US$11.6', 
'res:6', "res:'", "gr:'s", 'res:s', 'res:US', 'res:$', 'res:10', 'res:.', 'gr:US$10.4', 
'res:4', 'res:.', 'res:.', 'gr:...', 'res:.', 'gr:\n', 'res:.', 'res:.', 'gr:...', 'res:.', 
'res:multi', 'res:-', 'gr:multi-state', 'res:state', 'res:.', 'res:.', 'gr:...', 'res:.', 
'gr:\n', 'res:2010', 'res:-', 'res:07', 'res:-', 'gr:2010-07-05', 'res:05', 'res:2014', 
'res:_', 'res:05', 'res:_', 'gr:2014_05_14', 'res:14', 'gr:\n', 'gr:\n']
> 0.694610778443
Finally, a solution which does the job without mistakes:
def tokenization_deep(text):
    #Author: Ruslan Mavlyutov, m-ceros@yandex.ru. Please don't remove this line.
    import re
    white_space_pattern = "\s"
    possessive_pattern = "'s"
    http_pattern = "ftps*:\/\/[^\s]+|https*:\/\/[^\s]+|www\.[^\s]+"
    word_chunk_pattern = "[a-zA-Z0-9]+"
    punctuation = "\"'!?\.,;:\-\)\(\]\[\}\{"
    always_divide_punct = "[](){},;"
    punct_pattern = "\.\.\.|[%s]" % (punctuation)
    other_pattern = "[^\sa-zA-Z0-9\"'!?.,;:-]"
    sub_words = [word for word in re.findall("|".join([white_space_pattern, possessive_pattern, 
                                                       http_pattern, word_chunk_pattern, punct_pattern, 
                                                       other_pattern]), text)]
    sub_words += [" "] # don't want to process tail cases separately
    types = ""
    for chunk in sub_words:
        if chunk in " \t\n":
            types += 's'
            continue
        elif chunk in punctuation or chunk == "...":
            types += 'p'
        elif (chunk[0] >= '0' and chunk[0] <= '9') or \
             (chunk[0] >= 'A' and chunk[0] <= 'Z') or \
             (chunk[0] >= 'a' and chunk[0] <= 'z'):
            types += 'w'
        else:
            types += 'o'
    words = []
    word = ""
    for chunk_index in xrange(len(sub_words)):
        if types[chunk_index] == 's':
            if word:
                words += [word]
                word = ""
            if sub_words[chunk_index] == "\n":
                words += [sub_words[chunk_index]]
        elif types[chunk_index] == 'p' or sub_words[chunk_index] == "'s":
            separate_from_prev_word = True
            if word and not sub_words[chunk_index] in always_divide_punct:
                for next_chunk_index in xrange(chunk_index + 1, len(sub_words)):
                    if types[next_chunk_index] == 's':
                        break
                    elif types[next_chunk_index] in 'wo':
                        separate_from_prev_word = False
                        break
            if not separate_from_prev_word:
                word += sub_words[chunk_index]
            else:
                if word:
                    words += [word]
                    word = ""
                words += [sub_words[chunk_index]]
        else:
            word += sub_words[chunk_index]   
    return words
Result:
print estimate_quality(tokenization_deep, True)
> 1.0

2. Parsers

From Wikipedia:
A parser is a software component that takes input data (frequently text) and builds a data structure – often some kind of parse tree, abstract syntax tree or other hierarchical structure – giving a structural representation of the input, checking for correct syntax in the process.

2.1 XML(HTML) parser

NOTE: The goal here is not to create a full-weight HTML-parser. For that kind of task one should use HTMLParser or something similar.
Here we are going to leave a plain text and an information about the initial markup in a separate data structure.
The code:
def parse_tag_info(tag):
    #Author: Ruslan Mavlyutov, http://factex.blogspot.ch/. Please don't remove this line.
    import re
    tag = tag[1:-1].replace("\n", " ")
    tag = ("/" == tag[-1] and tag[:-1] or "/" == tag[0] and tag[1:] or tag).strip()
    if not "=" in tag:
        name = tag
        fields = []
    else:
        name = tag[:tag.index(" ")]
        fields_dump = tag[tag.index(" "):].strip()
        fields_dump = [item.strip() for item in  fields_dump.split("=")]
        fields = [[fields_dump[0], ""], ]
        for item in fields_dump[1:]:
            value = item
            if " " in item:
                fields[-1][-1] = item[:item.rfind(" ")]
                fields += [[item[item.rfind(" ") + 1:], ""]]
            else:
                fields[-1][-1] = item.strip()
        fields = [(key.strip(), value.strip()) for key, value in fields]
    return name.lower(), fields


def parse_html(html, ignore_script=False, ignore_comments=False):
    #Author: Ruslan Mavlyutov, http://factex.blogspot.ch/. Please don't remove this line.
    import re
    chunks = re.findall("<\![^>]+>|<[/]*[a-zA-Z]+[^>]*/*>|[^<>]+", html)
    text = ""
    markup = []
    markup_stack = []
    for chunk in chunks:
        is_tag = chunk.startswith("<") and not chunk.startswith("<!")
        if is_tag:
            is_close = chunk.startswith("</") or chunk.startswith("/>")
            is_open = not chunk.startswith("</")
            name, fields = parse_tag_info(chunk)
            if is_open:
                markup_stack += [[name, fields, len(text)]]
            if is_close:
                for stack_pos in xrange(len(markup_stack) - 1, -1, -1):
                    if markup_stack[stack_pos][0] == name:
                        name, fields, start = markup_stack[stack_pos]
                        markup += [(name, fields, start, len(text))]
                        markup_stack = markup_stack[:stack_pos] + markup_stack[stack_pos + 1:]
                        break
        else:  
            if chunk.startswith("<!DOCTYPE"):
                continue
            if ignore_script and markup_stack and markup_stack[-1][0] == "script":
                continue
            if ignore_comments and chunk.startswith("<!--"):
                continue
            text += " " + chunk
    return text, markup

The main step is separating tags from what is between them. We are doing this with the regular expression:
chunks = re.findall("<\![^>]+>|<[/]*[a-zA-Z]+[^>]*/*>|[^<>]+", html)
Next we go through the chunks and put them into two piles: a final text and a markup stack. As we have a matched pair (open tag, close tag), we are removing the open tag from the stack and put it into the array of final markup ranges. Each markup range has information about its start and end position in the final text, a tag name and an array of the tag attributes. Function parse_tag_info does parsing of a tag attributes. We may ignore content of scripts and meta-comments (<!--...-->;). It is useful if you are going to put the final text into an NLP-parser without filtering by tags. Let’s print text of last 3 paragraphs from the Wikipedia page about parsing:
import urllib2
html = urllib2.urlopen("http://en.wikipedia.org/wiki/Parsing").read()
text, markup = parse_html(html)
last3p = [text[start:end] for name, fields, start, end in markup if name == "p"][-3:]
for parag in last3p:
    print parag
>  The parse tree and resulting code from it is not correct according to language semantics.
>  To correctly parse without lookahead, there are three solutions:
>  The parse tree generated is correct and simply more efficient [ citation needed ]  than non-lookahead parsers. This is the strategy followed in  LALR parsers .
And first 10 links with their text:
import urllib2
html = urllib2.urlopen("http://en.wikipedia.org/wiki/Parsing").read()
text, markup = parse_html(html)
first10a = [(text[start:end], fields) for name, fields, start, end in markup if name == "a"][:10]
for link_text, fields in first10a:
    hrefs = [value for key, value in fields if key == "href"]
    href = hrefs and hrefs[0] or ""
    print href, link_text
>  
> "#mw-navigation"  navigation
> "#p-search"  search
> "/wiki/Parse_(disambiguation)"  Parse (disambiguation)
> "/wiki/Parser_(CGI_language)"  Parser (CGI language)
> "/wiki/String_(computer_science)"  string
> "/wiki/Natural_language"  natural language
> "/wiki/Computer_languages"  computer languages
> "/wiki/Formal_grammar"  formal grammar
> "/wiki/Part_of_speech"  part (of speech)

If you just print the final text, you will see a lot of white spaces. It is important to preserve them if you want to use the markup information.


Another crucial function is decoding HTML entities (like “&amp;”, or “&#Xe5;”). The code and an example of usage:

def unescape_html_entities(text_unicode, log_replacements=False):
    #Author: Ruslan Mavlyutov, http://factex.blogspot.ch/. Please don't remove this line.
    import re
    from htmlentitydefs import entitydefs
    while True:
        replaced = 0
        entities2replace = set([entity for entity in re.findall("&[a-z]+;|&#[xX]*[0-9a-f]+;", text_unicode)])
        for entity in entities2replace:
            code = entity[1:-1]
            if code in entitydefs:
                #allows to keep same text indentation
                replace_with = ('{:>' + str(len(entity)) + '}').format(unichr(ord(entitydefs[code])))
                replaced += 1
            elif code.startswith("#x") or code.startswith("#X"): #hexadecimal
                decimal = int(code[2:], 16)
                replace_with = ('{:>' + str(len(entity)) + '}').format(unichr(decimal))
                replaced += 1    
            elif code.startswith("#"):
                replace_with = ('{:>' + str(len(entity)) + '}').format(unichr(int(code[1:])))
                replaced += 1
            else:
                replace_with = entity
            if log_replacements:
                print entity.decode("utf8"), "->", replace_with.decode("utf8")
            text_unicode = text_unicode.replace(entity, replace_with)
        if not replaced:
            break
    return text_unicode

#example:
import urllib2
html = urllib2.urlopen("http://www.w3.org/TR/html4/charset.html#h-5.3").read()
text, markup = parse_html(html)
text = unescape_html_entities(text.decode("utf8"), True)
> &amp; ->     &
> &nbsp; ->      
> &lt; ->     <
> &gt; ->    >
> &#229; ->      å
> &quot; ->      "
> &#1048; ->       И
> &aring; ->       å
> &#Xe5; ->      å
> &amp; ->     &

To be continued.

No comments:

Post a Comment