Skip to content

File Utilities

create_path(path)

Create a path recursively.

Source code in src/deeponto/utils/file_utils.py
29
30
31
def create_path(path: str):
    """Make the directory `path`, creating any missing parent directories on the way.

    Nothing happens if the directory already exists.
    """
    directory = Path(path)
    directory.mkdir(exist_ok=True, parents=True)

save_file(obj, save_path, sort_keys=False)

Save an object to a file; the format (JSON, pickle, or YAML) is chosen from the extension of save_path.

Source code in src/deeponto/utils/file_utils.py
34
35
36
37
38
39
40
41
42
43
44
45
46
def save_file(obj, save_path: str, sort_keys: bool = False):
    """Save an object to a file whose format is chosen by the extension of `save_path`.

    Supported extensions are `.json` (written with `json.dump`), `.pkl` (written with
    `pickle.dump`), and `.yaml` (written with `yaml.dump`).

    Args:
        obj: The object to serialise; it must be supported by the selected backend.
        save_path (str): Destination file path. Path-like objects such as `pathlib.Path`
            are accepted as well.
        sort_keys (bool, optional): Whether to sort dictionary keys in the output;
            only used for `.json`. Defaults to `False`.

    Raises:
        RuntimeError: If `save_path` does not end with a supported extension.
    """
    import os  # local import: this function must not rely on a module-level `os`

    # Normalise path-like inputs so the extension checks also work for `pathlib.Path`.
    save_path = os.fspath(save_path)

    if save_path.endswith(".json"):
        with open(save_path, "w") as output:
            json.dump(obj, output, indent=4, separators=(",", ": "), sort_keys=sort_keys)
    elif save_path.endswith(".pkl"):
        with open(save_path, "wb") as output:
            # protocol -1 selects the highest pickle protocol available
            pickle.dump(obj, output, -1)
    elif save_path.endswith(".yaml"):
        with open(save_path, "w") as output:
            yaml.dump(obj, output, default_flow_style=False, allow_unicode=True)
    else:
        raise RuntimeError(f"Unsupported saving format: {save_path}")

load_file(save_path)

Load an object from a file; the format (JSON, pickle, or YAML) is chosen from the extension of save_path.

Source code in src/deeponto/utils/file_utils.py
49
50
51
52
53
54
55
56
57
58
59
60
61
def load_file(save_path: str):
    """Load an object from a file whose format is chosen by the extension of `save_path`.

    Supported extensions are `.json` (read with `json.load`), `.pkl` (read with
    `pickle.load`), and `.yaml` (read with `yaml.safe_load`).

    Args:
        save_path (str): Path of the file to read. Path-like objects such as
            `pathlib.Path` are accepted as well.

    Returns:
        The deserialised object.

    Raises:
        RuntimeError: If `save_path` does not end with a supported extension.
    """
    import os  # local import: this function must not rely on a module-level `os`

    # Normalise path-like inputs so the extension checks also work for `pathlib.Path`.
    save_path = os.fspath(save_path)

    if save_path.endswith(".json"):
        # JSON text is UTF-8 by specification, so do not depend on the locale's default encoding.
        with open(save_path, "r", encoding="utf-8") as source:
            return json.load(source)
    elif save_path.endswith(".pkl"):
        # NOTE: unpickling can execute arbitrary code; only load pickle files from trusted sources.
        with open(save_path, "rb") as source:
            return pickle.load(source)
    elif save_path.endswith(".yaml"):
        with open(save_path, "r") as source:
            return yaml.safe_load(source)
    else:
        raise RuntimeError(f"Unsupported loading format: {save_path}")

copy2(source, destination)

Copy a file from source to destination.

Source code in src/deeponto/utils/file_utils.py
64
65
66
67
68
69
70
def copy2(source: str, destination: str):
    """Copy the file at `source` to `destination` with `shutil.copy2` and report the outcome on stdout.

    When both paths refer to the same file, nothing is copied and a notice is printed instead.
    """
    try:
        shutil.copy2(source, destination)
    except shutil.SameFileError:
        print(f"same file exists at {destination}")
    else:
        print(f"copied successfully FROM {source} TO {destination}")

read_table(table_file_path)

Read csv or tsv file as pandas dataframe without treating "NULL", "null", and "n/a" as an empty string.

Source code in src/deeponto/utils/file_utils.py
73
74
75
76
77
78
def read_table(table_file_path: str):
    r"""Load a `csv` or `tsv` file into a pandas dataframe, keeping the literal cell values `"NULL"`, `"null"`, and `"n/a"` as strings instead of missing values.

    A path ending in `.tsv` is parsed as tab-separated; any other path is parsed as comma-separated.
    """
    # TODO: this might change with the version of pandas
    kept_as_text = {"NULL", "null", "n/a"}
    # pandas' default NA markers minus the three strings that must survive as text
    na_markers = pd.io.parsers.readers.STR_NA_VALUES - kept_as_text
    if table_file_path.endswith(".tsv"):
        delimiter = "\t"
    else:
        delimiter = ","
    return pd.read_csv(
        table_file_path,
        sep=delimiter,
        keep_default_na=False,
        na_values=na_markers,
    )

read_jsonl(file_path)

Read .jsonl file (list of json) introduced in the BLINK project.

Source code in src/deeponto/utils/file_utils.py
81
82
83
84
85
86
87
88
89
90
91
92
def read_jsonl(file_path: str):
    """Read a `.jsonl` file (one JSON object per line), the format used in the BLINK project.

    Blank or whitespace-only lines are skipped. The union of keys seen across all
    records is printed to stdout.

    Args:
        file_path (str): Path of the `.jsonl` file; decoded as UTF-8, with a leading
            byte-order mark ignored if present.

    Returns:
        (list): The decoded records, in file order.
    """
    results = []
    seen_keys = set()
    with open(file_path, "r", encoding="utf-8-sig") as f:
        # Iterate the file lazily rather than materialising every line first.
        for line in f:
            # An empty line (e.g. trailing newlines at the end of the file) is not a
            # JSON document and would make `json.loads` fail.
            if not line.strip():
                continue
            record = json.loads(line)
            results.append(record)
            seen_keys.update(record.keys())
    print(f"all available keys: {seen_keys}")
    return results

read_oaei_mappings(rdf_file)

Read mapping files in the OAEI RDF format.

Source code in src/deeponto/utils/file_utils.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def read_oaei_mappings(rdf_file: str):
    """Parse a mapping file in the OAEI rdf format.

    Every element whose tag contains `Cell` describes one mapping
    `entity1 -relation-> entity2` together with a `measure`.

    Returns a pair of lists of `(entity1, entity2, measure)` tuples: first the
    mappings whose relation is `"="`, `">"`, or `"<"`, then the mappings whose
    relation is `"?"`. Cells with any other relation are reported on stdout and
    left out of both lists.
    """
    # Checked in this order against each child tag; the first hit wins.
    field_names = ("entity1", "entity2", "relation", "measure")
    # =: equivalent; > superset of; < subset of.
    kept_relations = ("=", ">", "<")

    ref_mappings = []  # relation is one of `kept_relations`
    ignored_mappings = []  # relation is "?"

    root = ET.parse(rdf_file).getroot()
    for cell in root.iter():
        if "Cell" not in cell.tag:
            continue

        fields = dict.fromkeys(field_names)
        for child in cell:
            for name in field_names:
                if name not in child.tag:
                    continue
                if name in ("entity1", "entity2"):
                    # the entity IRI is carried by the element's first attribute value
                    fields[name] = list(child.attrib.values())[0]
                else:
                    fields[name] = child.text
                break

        rel = fields["relation"]
        row = (fields["entity1"], fields["entity2"], fields["measure"])
        if rel in kept_relations:
            ref_mappings.append(row)
        elif rel == "?":
            ignored_mappings.append(row)
        else:
            print("Unknown Relation Warning: ", rel)

    print('#Maps ("="):', len(ref_mappings))
    print('#Maps ("?"):', len(ignored_mappings))

    return ref_mappings, ignored_mappings

run_jar(jar_command, timeout=3600)

Run jar command using subprocess.

Source code in src/deeponto/utils/file_utils.py
130
131
132
133
134
135
136
137
138
139
def run_jar(jar_command: str, timeout=3600):
    """Run a jar command in a subprocess and wait for it to finish.

    The command string is split on whitespace into the argument list; no shell is
    involved, so quoting inside `jar_command` is not interpreted. If the process has
    not finished after `timeout` seconds it is killed and a warning is issued.

    Args:
        jar_command (str): The full command line, e.g. `"java -jar tool.jar arg"`.
        timeout (int, optional): Maximum running time in seconds. Defaults to `3600`.

    Raises:
        ValueError: If `jar_command` is empty or contains only whitespace.
    """
    # `str.split()` without a separator collapses runs of whitespace. Splitting on a
    # single space would turn "java  -jar" into ["java", "", "-jar"] and hand empty
    # arguments (or an empty executable name) to the subprocess.
    args = jar_command.split()
    if not args:
        raise ValueError("jar_command must contain at least one token")

    print(f"Run jar command with timeout: {timeout}s.")
    proc = subprocess.Popen(args)
    try:
        proc.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        warnings.warn("kill the jar process as timed out")
        proc.kill()
        # Collect the killed process so it does not linger as a zombie.
        proc.communicate()

Last update: February 1, 2023
Created: January 14, 2023
GitHub: @Lawhy   Personal Page: yuanhe.wiki