# Listing metadata: 167 lines, 5.7 KiB
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re
import fractions
from collections import namedtuple
# Per-label evaluation record: three counts (match, model, ref) followed by
# the precision / recall / F1 values reported for that label.
LabelScore = namedtuple(
    typename='LabelScore',
    field_names='match model ref precision recall f1',
)
class TrainLogParser(object):
    """Incremental parser for a training log, fed one chunk/line at a time.

    The parser is a small state machine. ``feed`` dispatches every line to
    the ``handle_<STATE>`` method for the current state; a handler may update
    parsed attributes, switch state and return an event name.

    States: ``STARTING`` -> ``FEATGEN`` -> ``AFTER_FEATGEN`` ->
    ``ITERATION`` <-> ``AFTER_ITERATION`` -> ``STORING``.

    Events returned by ``feed``: ``'start'``, ``'featgen_progress'``,
    ``'featgen_end'``, ``'prepared'``, ``'prepare_error'``, ``'iteration'``,
    ``'optimization_end'`` and ``'end'``.

    Attributes:
        state: current state name, or None before the first line is fed.
        featgen_percent: starts at -2 and grows by 2 per progress token.
        featgen_num_features: int parsed from "Number of features: N".
        featgen_seconds: float from the "Seconds required" line in FEATGEN.
        training_seconds: float from "Total seconds required for training".
        storing_seconds: float from the "Seconds required" line in STORING.
        iterations: list of per-iteration dicts, in order of appearance.
        last_iteration: the dict currently being filled (also the last
            element of ``iterations``), or None before the first iteration.
        log: every line passed to ``feed``, in order.
        events: list of ``(event, start, end)`` tuples where ``start:end``
            is the slice of ``log`` attributed to that event.
    """

    # (key, pattern, converter) for metrics with a single captured value.
    _ITERATION_FIELDS = (
        ("loss", r"Loss: (\d+\.\d+)", float),
        ("feature_norm", r"Feature norm: (\d+\.\d+)", float),
        ("error_norm", r"Error norm: (\d+\.\d+)", float),
        ("active_features", r"Active features: (\d+)", int),
        ("linesearch_trials", r"Line search trials: (\d+)", int),
        ("linesearch_step", r"Line search step: (\d+\.\d+)", float),
        ("time", r"Seconds required for this iteration: (\d+\.\d+)", float),
    )

    # (key, pattern) for "<hits> / <total>" accuracy lines.
    _ACCURACY_FIELDS = (
        ("item_accuracy", r"Item accuracy: (\d+) / (\d+)"),
        ("instance_accuracy", r"Instance accuracy: (\d+) / (\d+)"),
    )

    def __init__(self):
        self.state = None
        # Starts at -2 so that the first progress token brings it to 0.
        self.featgen_percent = -2
        self.featgen_num_features = None
        self.featgen_seconds = None
        self.training_seconds = None
        self.storing_seconds = None

        self.iterations = []
        self.last_iteration = None
        self.log = []
        self.events = []

    def feed(self, line):
        """Consume one log line and return the event it triggers, if any.

        The line is always recorded in ``self.log``. The very first line
        initialises the state machine and yields ``'start'``; afterwards the
        line is routed to the handler of the current state.

        Returns:
            The event name (str) or None when the line triggers no event.
        """
        # Record every line; event spans and last_log() index into this list.
        self.log.append(line)

        if self.state is None:
            self.state = 'STARTING'
            # The first line may already be the feature-generation header.
            self.handle_STARTING(line)
            self.events.append(('start', 0, len(self.log)))
            return 'start'

        event = getattr(self, "handle_" + self.state)(line)
        if event is not None:
            start, end = self.events[-1][2], len(self.log)
            if event in ('prepared', 'optimization_end'):
                # The triggering line opens the *next* section, so leave it
                # out of this span; the next event's span will start on it.
                end -= 1
            self.events.append((event, start, end))
        return event

    def last_log(self):
        """Return the concatenated log lines belonging to the latest event."""
        _event, start, end = self.events[-1]
        return ''.join(self.log[start:end])

    def handle_STARTING(self, line):
        """Wait for the 'Feature generation' header, then enter FEATGEN."""
        if line.startswith('Feature generation'):
            self.state = 'FEATGEN'

    def handle_FEATGEN(self, line):
        """Parse feature-generation output.

        Returns ``'featgen_progress'`` for a progress token,
        ``'featgen_end'`` on the "Seconds required" line, otherwise None.
        """
        # Progress arrives as fragments of "0....1.... ... 10"; this is a
        # substring test, so any fragment of that literal counts as a token.
        if line in "0123456789.10":
            self.featgen_percent += 2
            return 'featgen_progress'

        m = re.match(r"Number of features: (\d+)", line)
        if m:
            self.featgen_num_features = int(m.group(1))
            return None

        seconds = self._seconds(line)
        if seconds is not None:
            self.featgen_seconds = seconds
            self.state = 'AFTER_FEATGEN'
            return 'featgen_end'
        return None

    def handle_AFTER_FEATGEN(self, line):
        """Wait for the first iteration header or an error message.

        Returns ``'prepared'`` when the first iteration header is seen,
        ``'prepare_error'`` when the line contains 'terminated with error',
        otherwise None.
        """
        if self._iteration_head(line) is not None:
            self.state = 'ITERATION'
            # Process the header now so the iteration record exists before
            # the first metric line arrives.
            self.handle_ITERATION(line)
            return 'prepared'

        if 'terminated with error' in line:
            self.state = 'AFTER_ITERATION'
            return 'prepare_error'
        return None

    def handle_ITERATION(self, line):
        """Parse one line of an iteration report into ``last_iteration``.

        A header line starts a new record (appended to ``iterations``); a
        bare newline ends the report and returns ``'iteration'``. Any other
        line is matched against the known metric formats and ignored if none
        applies.

        Raises:
            ZeroDivisionError: if an accuracy line has a zero denominator.
        """
        iteration_num = self._iteration_head(line)
        if iteration_num is not None:
            self.last_iteration = {
                'num': iteration_num,
                'scores': {},
            }
            self.iterations.append(self.last_iteration)
            return None

        if line == '\n':
            self.state = 'AFTER_ITERATION'
            return 'iteration'

        record = self.last_iteration

        for key, pattern, convert in self._ITERATION_FIELDS:
            m = re.match(pattern, line)
            if m:
                record[key] = convert(m.group(1))
                return None

        m = re.match(
            r"Macro-average precision, recall, F1: "
            r"\((\d\.\d+), (\d\.\d+), (\d\.\d+)\)",
            line,
        )
        if m:
            record['avg_precision'] = float(m.group(1))
            record['avg_recall'] = float(m.group(2))
            record['avg_f1'] = float(m.group(3))
            return None

        for key, pattern in self._ACCURACY_FIELDS:
            m = re.match(pattern, line)
            if m:
                # Keep the exact ratio and a float convenience copy.
                acc = fractions.Fraction(int(m.group(1)), int(m.group(2)))
                record[key] = acc
                record[key + '_float'] = float(acc)
                return None

        # Per-label line with numeric precision / recall / F1.
        m = re.match(
            r"\s{4}(.+): \((\d+), (\d+), (\d+)\) "
            r"\((\d\.\d+), (\d\.\d+), (\d\.\d+)\)",
            line,
        )
        if m:
            record['scores'][m.group(1)] = LabelScore(
                match=int(m.group(2)),
                model=int(m.group(3)),
                ref=int(m.group(4)),
                precision=float(m.group(5)),
                recall=float(m.group(6)),
                f1=float(m.group(7)),
            )
            return None

        # Per-label line with all-zero counts, where the ratios are printed
        # as "******" instead of numbers.
        m = re.match(r"\s{4}(.+): \(0, 0, 0\) \(\*{6}, \*{6}, \*{6}\)", line)
        if m:
            record['scores'][m.group(1)] = LabelScore(
                match=0,
                model=0,
                ref=0,
                precision=None,
                recall=None,
                f1=None,
            )
        return None

    def handle_AFTER_ITERATION(self, line):
        """Handle lines between iteration reports.

        A new iteration header re-enters ITERATION; the total training time
        is recorded when present; 'Storing the model' switches to STORING
        and returns ``'optimization_end'``.
        """
        if self._iteration_head(line) is not None:
            self.state = 'ITERATION'
            return self.handle_ITERATION(line)

        m = re.match(r"Total seconds required for training: (\d+\.\d+)", line)
        if m:
            self.training_seconds = float(m.group(1))

        if line.startswith('Storing the model'):
            self.state = 'STORING'
            return 'optimization_end'
        return None

    def handle_STORING(self, line):
        """Record the storing time; a bare newline returns ``'end'``."""
        if line == '\n':
            return 'end'
        seconds = self._seconds(line)
        # Compare against None: a legitimate value of 0.0 is falsy.
        if seconds is not None:
            self.storing_seconds = seconds
        return None

    def _iteration_head(self, line):
        """Return the number from an Iteration/Epoch header line, else None."""
        m = re.match(r'\*{5} (?:Iteration|Epoch) #(\d+) \*{5}\n', line)
        if m:
            return int(m.group(1))
        return None

    def _seconds(self, line):
        """Return the float from a 'Seconds required: X' line, else None."""
        m = re.match(r'Seconds required: (\d+\.\d+)', line)
        if m:
            return float(m.group(1))
        return None