# 09.日志字段快速抽取

经常会遇到需要对日志中输出的结构化数据进行抽取和分析的场景。这里提供一个简单脚本，可以快速抽取 JSON 中的某些字段；如果字段是数字，还可以统计其分位值（p50/p95/p99）。

用法

python3 log_json.py -i ~/data/final_279618863.txt -f ".sessionId,.message.statistics.firstTokenInMillisecond,.message.statistics.measurements.SEND_TO_CLIENT[-1:]" -p 1 2

JSONPath 语法可以参考《JSONPath - XPath for JSON》。

输出

['f510cfa632a54d5a85a04b60259068e6', 268, 843]
['b9947420dacf497fa5b7bde9f55c13a4', 481, 993]
['72f9c0c921fb49f4a2d40c9713056f5a', 630, 1157]
['39c5584fa5ac46e29147b091fec73108', 435, 3471]
2.统计信息
.message.statistics.firstTokenInMillisecond,.message.statistics.measurements.SEND_TO_CLIENT[-1:]
p50:    [ 432 1204]
p95:    [ 881 5387]
p99:    [1117 8089]

1
2
3
4
5
6
7
8
9

import json
import logging
from jsonpath import JSONPath
import os
import argparse
import numpy as np

'''
快速输出日志中对应的字段
dependency:
pip install jsonpath-python

'''


def _nearest_quantile(samples, q):
    """Return the per-column ``q`` quantile of ``samples`` (nearest-sample selection)."""
    try:
        return np.quantile(samples, q, axis=0, method="nearest")
    except TypeError:
        # Older NumPy releases only know the previous ``interpolation`` keyword.
        return np.quantile(samples, q, axis=0, interpolation="nearest")


def _is_number(value):
    """Tell whether ``value`` is an int/float usable for statistics (bools excluded)."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def log_extract(input, fields: list, percentile_index: list):
    """Print selected JSON fields of every log line, plus optional percentiles.

    Each line of the file is decoded as UTF-8 (undecodable bytes are dropped)
    and parsed with ``convert_to_dict``. For every parsed line the first match
    of each JSONPath in ``fields`` is collected; a path without any match
    yields ``None`` instead of aborting the whole run.

    Args:
        input: Path of the log file, one JSON document per line.
        fields: JSONPath expressions without the leading ``$``
            (e.g. ``".sessionId"``).
        percentile_index: Positions in ``fields`` whose values are summarised
            as p50/p95/p99. Positions outside ``fields`` are ignored. ``None``
            or an empty list disables the statistics section.

    Returns:
        None. Results are printed to stdout; problems are reported through
        ``logging``.
    """
    if not os.path.exists(input):
        logging.error("input file should be existed,input:{}".format(input))
        return

    if not fields:
        logging.error("filed list should be non-empty")
        return

    percentile_index = percentile_index or []
    # Every extracted row has exactly len(fields) entries, so the usable
    # statistic columns can be resolved once, before reading the file.
    stat_columns = [x for x in percentile_index if -len(fields) <= x < len(fields)]

    result = []
    percentile_result = []
    with open(input, mode="rb") as log_file:
        # Stream the file line by line; ``index`` is the zero-based line number.
        for index, raw_line in enumerate(log_file):
            line = raw_line.decode(encoding="utf-8", errors="ignore")
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue

            log_output = convert_to_dict(line)
            # Field extraction only makes sense on a non-empty JSON object/array.
            if not isinstance(log_output, (dict, list)) or not log_output:
                logging.error("convert to json failed, row:{}".format(index))
                continue

            # Collect the first match of every requested field.
            item = []
            for field in fields:
                matches = JSONPath('$' + field).parse(log_output)
                item.append(matches[0] if matches else None)
            result.append(item)

            if stat_columns:
                sample = [item[x] for x in stat_columns]
                if all(_is_number(value) for value in sample):
                    percentile_result.append(sample)
                else:
                    # A missing or non-numeric value would poison the quantiles.
                    logging.warning(
                        "row:{} skipped from percentile statistics: {}".format(index, sample)
                    )

    # analysis
    print("1.************************************************")
    print(",".join(fields))
    for item in result:
        print(item)

    if percentile_index:
        print("2.统计信息")
        print(",".join(fields[x] for x in stat_columns))
        if not percentile_result:
            logging.error("no numeric rows available for percentile statistics")
            return

        samples = np.array(percentile_result)
        for label, q in (("p50", 0.5), ("p95", 0.95), ("p99", 0.99)):
            print("{}:\t{}".format(label, _nearest_quantile(samples, q)))


def convert_to_dict(s):
    """Parse ``s`` as JSON and return the decoded value.

    ``s`` is converted with ``str()`` and stripped of surrounding whitespace
    before parsing. A falsy ``s`` yields ``None`` without any parsing attempt.
    When parsing fails the exception arguments are logged as an error and
    ``None`` is returned.
    """
    if s:
        try:
            parsed = json.loads(str(s).strip())
        except Exception as exc:
            logging.error(exc.args)
        else:
            return parsed

    return None


# Command-line interface definition. Building the parser has no side effects;
# the arguments themselves are only parsed inside ``main`` so that importing
# this module never touches ``sys.argv``.
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", required=True, type=str, help="the input json file")
parser.add_argument("-f", "--field", required=True, type=str, help="the output fields")
parser.add_argument("-p", "--percentile", required=False, nargs="+", type=int, help="the percentile column index")


def main(argv=None):
    """Parse the command line and run ``log_extract``.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    args = parser.parse_args(argv)
    print("args:", args.input, args.field, args.percentile)

    # Fields arrive as one comma-separated string; tolerate spaces around the
    # commas and ignore empty entries such as a trailing comma.
    fields = [part.strip() for part in str(args.field).split(",") if part.strip()]

    log_extract(args.input, fields, args.percentile or [])


if __name__ == "__main__":
    main()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92

← 08.Pandas模块 10.Streamlit组件介绍 →