-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcompile_ilab_qa.py
executable file
·96 lines (85 loc) · 3.25 KB
/
compile_ilab_qa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/bin/env python3
# just 'python' doesn't work in container environment -_^
# We are using ilab 0.18 schema, version 3
import json
import textwrap
import argparse
import yaml
import glob
import os
# Source document the generated knowledge file points at.
_document = dict(
    repo="https://github.com/fedora-copr/logdetective-taxonomy",
    commit="HEAD",
    patterns=["README.md"],
)

# Skeleton of the ilab knowledge file (schema version 3).
# "seed_examples" starts empty and is filled in by main().
# Note: the "task_description" key
# ("Annotated snippets from software logs that explain problems")
# was dropped in version 3 and is intentionally absent here.
data = dict(
    version=3,
    created_by="Log Detective Team",
    domain="software",
    seed_examples=[],
    document_outline="Building RPMs",
    document=_document,
)
def _load_reports(source_docs):
    """Yield the parsed JSON document of every ``*.json`` file below *source_docs*."""
    # Sorted so that the output order, and which of two duplicate snippets
    # is kept, does not depend on the order the filesystem lists files in.
    for path in sorted(glob.glob(f"{source_docs}/**/*.json", recursive=True)):
        with open(path, encoding="utf-8") as f:
            yield json.load(f)


def _seed_examples(reports, max_snippet_len=150, width=112):
    """Build the list of unique ilab seed examples from annotated reports.

    Each report is read through its ``logs``, ``how_to_fix`` and
    ``fail_reason`` keys; each log entry through ``content`` and ``snippets``;
    each snippet through ``start_index``, ``end_index`` and ``user_comment``.

    Snippets longer than *max_snippet_len* characters, snippets that are empty
    after wrapping, and snippets already seen are skipped.
    """
    # 120 is the instructlab limit for a yaml line:
    # 120 = 7 spaces for padding + 112 for the log line + 1 for EOL.
    # Since a snippet is a chunk of the log, be as strict as possible on the
    # wrapping: keep whitespace intact and never split words.
    wrapper_snippets = textwrap.TextWrapper(
        width=width, replace_whitespace=False, break_long_words=False,
        drop_whitespace=False, break_on_hyphens=False
    )
    wrapper_text = textwrap.TextWrapper(width=width)

    # ilab doesn't allow duplicate entries, so we need to make our entries unique
    seen_snippets = set()
    examples = []
    for report in reports:
        for log in report['logs'].values():
            for annotation in log['snippets']:
                snippet = log['content'][annotation['start_index']:annotation['end_index']]
                if len(snippet) > max_snippet_len:
                    # too big, we'll figure it out later
                    continue
                snippet = wrapper_snippets.fill(snippet).strip()
                if not snippet or snippet in seen_snippets:
                    continue
                seen_snippets.add(snippet)
                examples.append({
                    "context": snippet,
                    "questions_and_answers": [{
                        "question": "Explain log snippets from an RPM build.",
                        "answer": wrapper_text.fill(annotation["user_comment"])
                    }, {
                        "question": "How can I resolve the issue?",
                        "answer": wrapper_text.fill(report["how_to_fix"])
                    }, {
                        "question": "What is the reason the build has failed?",
                        "answer": wrapper_text.fill(report["fail_reason"])
                    }]
                })
    return examples


def main():
    """Compile annotated log snippets into an ilab ``qna.yaml`` knowledge file.

    Reads every ``*.json`` file below ``--source-docs``, appends one seed
    example per unique snippet to the module-level ``data["seed_examples"]``
    and dumps ``data`` as YAML to ``--knowledge-target``, creating the target
    directory when it does not exist yet.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-s',
        '--source-docs',
        type=str,
        default='./results/results',
        help="First argument (PATH) is a path to directory tree of unpacked"
             " results from the https://logdetective.com/download website."
    )
    parser.add_argument(
        '-d',
        '--knowledge-target',
        type=str,
        default='./knowledge/technology/qna.yaml',
        help="Path of the YAML knowledge file to write."
    )
    args = parser.parse_args()

    data["seed_examples"].extend(_seed_examples(_load_reports(args.source_docs)))

    # A bare file name has an empty dirname and os.makedirs('') raises, so
    # only create the directory when the target actually has one.
    target_dir = os.path.dirname(args.knowledge_target)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    with open(args.knowledge_target, 'w', encoding="utf-8") as f:
        f.write(yaml.dump(data, default_style="|"))
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()