# 09.日志字段快速抽取
经常会遇到需要对日志输出的结构化数据做抽取和分析的场景。这里提供一个简单脚本,可以快速抽取 JSON 中的某个字段;如果字段是数字,还可以统计分位值。
用法
- python3 log_json.py -i ~/data/final_279618863.txt -f ".sessionId,.message.statistics.firstTokenInMillisecond,.message.statistics.measurements.SEND_TO_CLIENT[-1:]" -p 1 2
JSONPath 语法可以参考《JSONPath - XPath for JSON》。
输出
['f510cfa632a54d5a85a04b60259068e6', 268, 843]
['b9947420dacf497fa5b7bde9f55c13a4', 481, 993]
['72f9c0c921fb49f4a2d40c9713056f5a', 630, 1157]
['39c5584fa5ac46e29147b091fec73108', 435, 3471]
2.统计信息
.message.statistics.firstTokenInMillisecond,.message.statistics.measurements.SEND_TO_CLIENT[-1:]
p50: [ 432 1204]
p95: [ 881 5387]
p99: [1117 8089]
1
2
3
4
5
6
7
8
9
2
3
4
5
6
7
8
9
import json
import logging
from jsonpath import JSONPath
import os
import argparse
import numpy as np
'''
快速输出日志中对应的字段
dependency:
pip install jsonpath-python
'''
def _is_number(value):
    """Return True for int/float values, excluding bool."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _nearest_quantile(samples, q):
    """Return the column-wise nearest-rank quantile *q* of *samples*."""
    try:
        # NumPy >= 1.22 spells the option `method`; `interpolation` is deprecated.
        return np.quantile(samples, q, axis=0, method="nearest")
    except TypeError:
        # Older NumPy releases only know the `interpolation` keyword.
        return np.quantile(samples, q, axis=0, interpolation="nearest")


def log_extract(input, fields: list, percentile_index: list):
    """Print selected JSON fields of every log line, plus optional percentiles.

    Each line of the file is parsed as one JSON document.  For every entry in
    ``fields`` the JSONPath ``'$' + field`` is evaluated and its first match
    is collected (``None`` when the path matches nothing).

    Args:
        input: Path of the log file, one JSON document per line.
        fields: JSONPath suffixes such as ``".sessionId"``.
        percentile_index: Positions inside ``fields`` whose values are
            aggregated into p50/p95/p99.  Empty or ``None`` disables the
            statistics section.

    Returns:
        None.  Results are printed to stdout; problems are reported through
        ``logging`` and the function returns early.
    """
    if not os.path.exists(input):
        logging.error("input file should be existed,input:{}".format(input))
        return
    if not fields:
        logging.error("field list should be non-empty")
        return
    percentile_index = percentile_index or []

    result = []
    percentile_result = []
    with open(input, mode="rb") as log_file:
        # Iterate lazily so large logs are not loaded into memory at once;
        # enumerate keeps the reported row number correct after bad lines.
        for index, raw_line in enumerate(log_file):
            line = raw_line.decode(encoding="utf-8", errors="ignore")
            if not line.strip():
                continue  # blank lines carry no record
            log_output = convert_to_dict(line)
            # JSONPath can only walk containers, so scalars count as failures.
            if not log_output or not isinstance(log_output, (dict, list)):
                logging.error("convert to json failed, row:{}".format(index))
                continue

            # output the expected fields
            item = []
            for field in fields:
                matches = JSONPath('$' + field).parse(log_output)
                # A path without matches yields an empty result; keep the
                # column aligned with a None placeholder instead of crashing.
                item.append(matches[0] if matches else None)
            result.append(item)

            if percentile_index:
                percentile_cols = [item[x] for x in percentile_index if x < len(item)]
                if percentile_cols and all(_is_number(v) for v in percentile_cols):
                    percentile_result.append(percentile_cols)
                else:
                    logging.warning(
                        "row:{} skipped in statistics, non-numeric value:{}".format(
                            index, percentile_cols))

    # analysis
    print("1.************************************************")
    print(",".join(fields))
    for item in result:
        print(item)

    if not percentile_index:
        return
    print("2.统计信息")
    percentile_cols = [fields[x] for x in percentile_index if x < len(fields)]
    print(",".join(percentile_cols))
    if not percentile_result:
        # np.quantile cannot work on an empty sample set.
        logging.warning("no numeric rows available for percentile statistics")
        return
    percentile_result_np = np.array(percentile_result)
    for label, q in (("p50", 0.5), ("p95", 0.95), ("p99", 0.99)):
        print("{}:\t{}".format(label, _nearest_quantile(percentile_result_np, q)))
def convert_to_dict(s):
    """Parse one log line as JSON, returning None when that is not possible.

    Args:
        s: The raw line.  ``bytes``/``bytearray`` are decoded as UTF-8
            (undecodable bytes dropped); anything else is converted with
            ``str``.  Surrounding whitespace is ignored.

    Returns:
        The decoded JSON value, or ``None`` when ``s`` is empty/``None`` or
        does not contain valid JSON (the failure is logged).
    """
    if not s:
        return None
    if isinstance(s, (bytes, bytearray)):
        # str(b"...") would produce the "b'...'" repr, which never parses.
        s = bytes(s).decode("utf-8", errors="ignore")
    try:
        return json.loads(str(s).strip())
    except (ValueError, RecursionError) as e:
        # json.JSONDecodeError is a ValueError; RecursionError covers
        # pathologically nested input so a bad line never aborts the run.
        logging.error("invalid json: %s", e)
        return None
# Command-line interface.  Building the parser has no side effects, so it
# stays at module level; the arguments are only parsed when run as a script.
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", required=True, type=str, help="the input json file")
parser.add_argument("-f", "--field", required=True, type=str, help="the output fields")
parser.add_argument("-p", "--percentile", required=False, nargs="+", type=int, help="the percentile column index")


def main(argv=None):
    """Parse the command line and run the extraction.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    args = parser.parse_args(argv)
    print("args:", args.input, args.field, args.percentile)
    # Fields are comma separated; tolerate stray spaces and empty entries.
    fields = [name.strip() for name in str(args.field).split(",") if name.strip()]
    log_extract(args.input, fields, args.percentile or [])


if __name__ == "__main__":
    main()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
Apache License 2.0 | Copyright © 2022 by xueliang.wu 苏ICP备15016087号