
Text Utilities

Tokenizer(tokenizer_type)

A Tokenizer class for both sub-word (pre-trained) and word (rule-based) level tokenization.

Source code in src/deeponto/utils/text_utils.py
def __init__(self, tokenizer_type: str):
    self.type = tokenizer_type
    self._tokenizer = None  # hidden tokenizer
    self.tokenize = None  # the tokenization method

from_pretrained(pretrained_path='bert-base-uncased') classmethod

(Based on transformers) Load a sub-word level tokenizer from a pre-trained model.

Source code in src/deeponto/utils/text_utils.py
@classmethod
def from_pretrained(cls, pretrained_path: str = "bert-base-uncased"):
    """(Based on **transformers**) Load a sub-word level tokenizer from pre-trained model."""
    instance = cls("pre-trained")
    instance._tokenizer = AutoTokenizer.from_pretrained(pretrained_path)
    instance.tokenize = instance._tokenizer.tokenize
    return instance
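
A minimal usage sketch, assuming the transformers package is installed and the bert-base-uncased model can be downloaded or is already cached; sub-word tokenizers split rare words into word pieces:

tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
# BERT marks non-initial word pieces with a "##" prefix
print(tokenizer.tokenize("tokenization"))  # e.g., ['token', '##ization']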

from_rule_based() classmethod

(Based on spacy) Load a word-level (rule-based) tokenizer.

Source code in src/deeponto/utils/text_utils.py
@classmethod
def from_rule_based(cls):
    """(Based on **spacy**) Load a word-level (rule-based) tokenizer."""
    spacy.prefer_gpu()
    instance = cls("rule-based")
    instance._tokenizer = English()
    instance.tokenize = lambda texts: [word.text for word in instance._tokenizer(texts).doc]
    return instance
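
A corresponding sketch for the rule-based tokenizer; it assumes spacy is installed but needs no model download, since English() ships with built-in tokenization rules:

tokenizer = Tokenizer.from_rule_based()
# spaCy's rules split punctuation into separate tokens
print(tokenizer.tokenize("Hello, world!"))  # ['Hello', ',', 'world', '!']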

InvertedIndex(index, tokenizer)

Inverted index built from a text index.

Attributes:

  • tokenizer (Tokenizer): A tokenizer instance to be used.
  • original_index (defaultdict): A dictionary where the values are lists of text strings to be tokenized.
  • constructed_index (defaultdict): A dictionary that acts as the inverted index of original_index.

Source code in src/deeponto/utils/text_utils.py
def __init__(self, index: defaultdict, tokenizer: Tokenizer):
    self.tokenizer = tokenizer
    self.original_index = index
    self.constructed_index = defaultdict(list)
    for k, v in self.original_index.items():
        # value is a list of strings
        for token in self.tokenizer(v):
            self.constructed_index[token].append(k)
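
For illustration, a small hand-made index; the entity keys and labels are hypothetical, each value is a list of label strings as the constructor expects, and the tokenizer instance is called directly (its __call__ method exists in the full source but is not shown in this excerpt):

from collections import defaultdict

index = defaultdict(list)
index["ent1"] = ["protein binding"]  # hypothetical entity -> labels
index["ent2"] = ["protein folding"]

inverted = InvertedIndex(index, Tokenizer.from_rule_based())
# each token now maps back to the keys whose labels contain it
print(inverted.constructed_index["protein"])  # ['ent1', 'ent2']
print(inverted.constructed_index["binding"])  # ['ent1']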

idf_select(texts, pool_size=200)

Given a list of tokens, select a set of candidates based on their inverted document frequency (idf) scores.

We use idf instead of tf because labels have different lengths and thus tf is not a fair measure.

Source code in src/deeponto/utils/text_utils.py
def idf_select(self, texts: Union[str, List[str]], pool_size: int = 200):
    """Given a list of tokens, select a set candidates based on the inverted document frequency (idf) scores.

    We use `idf` instead of  `tf` because labels have different lengths and thus tf is not a fair measure.
    """
    candidate_pool = defaultdict(lambda: 0)
    # D := number of "documents", i.e., number of "keys" in the original index
    D = len(self.original_index)
    for token in self.tokenizer(texts):
        # each token is associated with some classes
        potential_candidates = self.constructed_index[token]
        if not potential_candidates:
            continue
        # We use idf instead of tf because the text for each class is of different length, tf is not a fair measure
        # inverse document frequency: with more classes to have the current token tk, the score decreases
        idf = math.log10(D / len(potential_candidates))
        for candidate in potential_candidates:
            # each candidate class is scored by sum(idf)
            candidate_pool[candidate] += idf
    candidate_pool = list(sorted(candidate_pool.items(), key=lambda item: item[1], reverse=True))
    # print(f"Select {min(len(candidate_pool), pool_size)} candidates.")
    # select the first K ranked
    return candidate_pool[:pool_size]
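
Continuing the hypothetical two-entity index above: each candidate is scored by summing idf(token) = log10(D / df(token)) over the matched query tokens, where D is the number of keys in the original index and df(token) is the number of keys whose labels contain the token.

# D = 2; "protein" occurs under both keys, so idf = log10(2/2) = 0,
# while "binding" occurs under one key, so idf = log10(2/1) ≈ 0.301
print(inverted.idf_select("protein binding", pool_size=10))
# e.g., [('ent1', 0.301...), ('ent2', 0.0)]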

process_annotation_literal(annotation_literal, apply_lowercasing=False, normalise_identifiers=False)

Pre-process an annotation literal string.

Parameters:

  • annotation_literal (str, required): A literal string of an entity's annotation.
  • apply_lowercasing (bool): Whether to lowercase the literal. Defaults to False.
  • normalise_identifiers (bool): Whether to normalise annotation text that is in the Java identifier format. Defaults to False.

Returns:

  • (str): The processed annotation literal string.

Source code in src/deeponto/utils/text_utils.py
def process_annotation_literal(
    annotation_literal: str, apply_lowercasing: bool = False, normalise_identifiers: bool = False
):
    """Pre-process an annotation literal string.

    Args:
        annotation_literal (str): A literal string of an entity's annotation.
        apply_lowercasing (bool): Whether to lowercase the literal. Defaults to `False`.
        normalise_identifiers (bool): Whether to normalise annotation text that is in the Java identifier format. Defaults to `False`.

    Returns:
        (str): the processed annotation literal string.
    """

    # replace the underscores with spaces
    annotation_literal = annotation_literal.replace("_", " ")

    # if the annotation literal is a valid identifier with first letter capitalised
    # we suspect that it could be a Java style identifier that needs to be split
    if normalise_identifiers and annotation_literal[0].isupper() and annotation_literal.isidentifier():
        annotation_literal = split_java_identifier(annotation_literal)

    # lowercase the annotation literal if specified
    if apply_lowercasing:
        annotation_literal = annotation_literal.lower()

    return annotation_literal
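
A short demonstration of the processing switches; the expected outputs follow directly from the code above:

print(process_annotation_literal("has_part"))
# 'has part'  (underscores replaced by spaces)
print(process_annotation_literal("APIReference", normalise_identifiers=True))
# 'API Reference'
print(process_annotation_literal("SuperNaturalPower", apply_lowercasing=True, normalise_identifiers=True))
# 'super natural power'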

split_java_identifier(java_style_identifier)

Split a Java-style identifier into a natural language phrase.

Examples:

  • "SuperNaturalPower" \(\rightarrow\) "Super Natural Power"
  • "APIReference" \(\rightarrow\) "API Reference"
  • "Covid19" \(\rightarrow\) "Covid 19"
Source code in src/deeponto/utils/text_utils.py
def split_java_identifier(java_style_identifier: str):
    r"""Split words in java's identifier style into natural language phrase.

    Examples:
        - `"SuperNaturalPower"` $\rightarrow$ `"Super Natural Power"`
        - `"APIReference"` $\rightarrow$ `"API Reference"`
        - `"Covid19"` $\rightarrow$ `"Covid 19"`
    """
    # split at every capital letter or number (numbers are treated as capital letters)
    raw_words = re.findall("([0-9A-Z][a-z]*)", java_style_identifier)
    words = []
    capitalized_word = ""
    for i, w in enumerate(raw_words):
        # the above regex pattern will split at capitals
        # so the capitalized words are split into characters
        # i.e., (len(w) == 1)
        if len(w) == 1:
            capitalized_word += w
            # edge case for the last word
            if i == len(raw_words) - 1:
                words.append(capitalized_word)

        # if the current w is a full word, save the previous
        # cached capitalized_word and also save current full word
        elif capitalized_word:
            words.append(capitalized_word)
            words.append(w)
            capitalized_word = ""

        # just save the current full word otherwise
        else:
            words.append(w)

    return " ".join(words)
