Skip to content

Commit

Permalink
Convert field names to enum and implement exclusion for Caddy logs
Browse files Browse the repository at this point in the history
  • Loading branch information
DavidVentura committed Mar 31, 2024
1 parent 35761d4 commit cf8c8e3
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 89 deletions.
64 changes: 63 additions & 1 deletion logscan/caddy_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ import (
"fmt"
"math"
"net/url"
"strings"
"time"

"github.com/bmatcuk/doublestar/v4"
)

// https://caddyserver.com/docs/caddyfile/directives/log
Expand Down Expand Up @@ -36,7 +39,8 @@ type Headers struct {
}

// CaddyParser is the parser type for Caddy log lines; see its Parse method.
type CaddyParser struct {
datetime string
datetime string
// Parse returns (nil, true, nil) for any entry that matches at least one of
// these patterns.
excludePatterns []excludePattern
}

func (p CaddyParser) Parse(line string) (Line, bool, error) {
Expand All @@ -47,6 +51,11 @@ func (p CaddyParser) Parse(line string) (Line, bool, error) {
return nil, false, err
}

for _, e := range p.excludePatterns {
if logEntry.matchesPattern(e) {
return nil, true, nil
}
}
return logEntry, false, nil
}

Expand Down Expand Up @@ -153,3 +162,56 @@ func (l CaddyLogEntry) Language() string {
}
return ""
}

// fieldValue returns the value of the log field called name as a string.
// Numeric fields (size, status) are formatted with fmt.Sprint.
//
// It panics if name is not one of the field* constants handled below.
//
// NOTE(review): processExcludes validates field names against the fields
// list, which also contains names that are not handled here (for example
// "datetime" and "timing_sec"); confirm that such rules cannot reach this
// method.
func (l CaddyLogEntry) fieldValue(name string) string {
	switch name {
	// Request line.
	case fieldMethod:
		return l.Method()
	case fieldPath:
		return l.Path()
	case fieldQuery:
		return l.Query()
	case fieldHttp:
		return l.HTTP()

	// Client and request headers.
	case fieldHost:
		return l.Host()
	case fieldRemoteAddr:
		return l.RemoteAddr()
	case fieldXff:
		return l.XForwardedFor()
	case fieldUserAgent:
		return l.UserAgent()
	case fieldReferrer:
		return l.Referrer()
	case fieldAcceptLanguage:
		return l.Language()

	// Response.
	case fieldContentType:
		return l.ContentType()
	case fieldStatus:
		return fmt.Sprint(l.Status())
	case fieldSize:
		return fmt.Sprint(l.Size())

	default:
		panic("Received invalid field request: " + name)
	}
}

// matchesPattern reports whether the exclude pattern e selects this log entry.
//
// The value of e.field is tested against e.pattern as a regular expression
// (excludeRe), a glob (excludeGlob) or a plain substring (any other kind);
// the outcome is inverted when e.negate is set.
func (l CaddyLogEntry) matchesPattern(e excludePattern) bool {
	value := l.fieldValue(e.field)

	var matched bool
	switch e.kind {
	case excludeRe:
		matched = e.re.MatchString(value)
	case excludeGlob:
		// doublestar is used rather than filepath.Match() because the latter
		// lacks support for "**" and "{a,b}" patterns, which are both very
		// useful here.
		//
		// The error is ignored here; processExcludes already test-matches
		// every glob pattern it produces and rejects invalid ones.
		matched, _ = doublestar.Match(e.pattern, value)
	default:
		matched = strings.Contains(value, e.pattern)
	}

	// A negated pattern flips the result.
	return matched != e.negate
}
83 changes: 79 additions & 4 deletions logscan/logscan.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,38 @@ import (
"fmt"
"io"
"regexp"
"slices"
"strings"
"time"

"github.com/bmatcuk/doublestar/v4"
"zgo.at/errors"
"zgo.at/follow"
"zgo.at/zlog"
)

// reFormat matches a literal backslash followed by "$" and one or more word
// characters, "-" or "_".
var reFormat = regexp.MustCompile(`\\\$[\w-_]+`)

// fields lists the recognised field names. processExcludes rejects any
// exclude rule whose field is not in this list.
var fields = []string{"ignore", "time", "date", "datetime", "remote_addr",
"xff", "method", "status", "http", "path", "query", "referrer",
"user_agent", "host", "content_type", "timing_sec", "timing_milli",
"timing_micro", "size"}
// Names of the individual log fields. The same strings are used as keys of
// RegexLine and as the field part of an exclude rule.
const (
fieldAcceptLanguage = "accept_language"
fieldContentType = "content_type"
fieldHost = "host"
fieldHttp = "http"
fieldMethod = "method"
fieldPath = "path"
fieldQuery = "query"
fieldReferrer = "referrer"
fieldRemoteAddr = "remote_addr"
fieldSize = "size"
fieldStatus = "status"
fieldUserAgent = "user_agent"
fieldXff = "xff"
)

// fields lists the recognised field names. processExcludes rejects any
// exclude rule whose field is not in this list. Note that fieldAcceptLanguage
// is not part of it.
var fields = []string{"ignore", "time", "date", "datetime", fieldRemoteAddr,
fieldXff, fieldMethod, fieldStatus, fieldHttp, fieldPath, fieldQuery, fieldReferrer,
fieldUserAgent, fieldHost, fieldContentType, "timing_sec", "timing_milli",
"timing_micro", fieldSize}

const (
excludeContains = 0
Expand Down Expand Up @@ -183,3 +201,60 @@ start:
// Datetime returns the timestamp of the line l, obtained by calling
// l.Datetime with the scanner's lp field.
func (s *Scanner) Datetime(l Line) (time.Time, error) {
	ts, err := l.Datetime(s.lp)
	return ts, err
}

// excludePattern is one parsed exclude rule, as produced by processExcludes.
type excludePattern struct {
kind int // exclude* constant; selects substring, glob or regexp matching
negate bool // rule was prefixed with "!"; the match result is inverted
field string // field the rule applies to, e.g. "path", "content_type"
pattern string // pattern without any "glob:" or "re:" prefix, e.g. ".gif", "*.gif"
re *regexp.Regexp // compiled pattern; only if kind=excludeRe
}

// processExcludes parses user-supplied exclude rules into excludePatterns.
//
// Each rule has the form "[!]field:[glob:|re:]pattern":
//
//   - a leading "!" negates the rule;
//   - field must be one of the names in fields;
//   - a "glob:" prefix selects doublestar glob matching, "re:" selects regular
//     expression matching, and no prefix selects plain substring matching.
//
// The aliases "static", "html" and "redirect" are expanded to predefined
// rules. "static" expands to two rules; the second one is placed after all
// other rules.
//
// An error is returned for an unknown field, a missing pattern, or a glob or
// regular expression that is not valid. The exclude slice itself is never
// modified.
func processExcludes(exclude []string) ([]excludePattern, error) {
	// Expand aliases into a new slice instead of writing into exclude, which
	// belongs to the caller.
	rules := make([]string, 0, len(exclude)+1)
	var extra []string // second rule of each "static" alias; added last
	for _, e := range exclude {
		switch e {
		case "static":
			// Note: maybe check if using glob patterns is faster?
			rules = append(rules, `path:re:.*\.(?:js|css|gif|jpe?g|png|svg|ico|web[mp]|mp[34])$`)
			extra = append(extra, `content_type:re:^(?:text/(?:css|javascript)|image/(?:png|gif|jpeg|svg\+xml|webp)).*?`)
		case "html":
			// The "re:" prefix is required: without it the expression is
			// compared as a literal substring and can never match.
			rules = append(rules, "content_type:re:^text/html.*?")
		case "redirect":
			rules = append(rules, "status:glob:30[0123]")
		default:
			rules = append(rules, e)
		}
	}
	rules = append(rules, extra...)

	patterns := make([]excludePattern, 0, len(rules))
	for _, e := range rules {
		var p excludePattern
		if strings.HasPrefix(e, "!") {
			p.negate = true
			e = e[1:]
		}

		p.field, p.pattern, _ = strings.Cut(e, ":")
		if !slices.Contains(fields, p.field) {
			return nil, fmt.Errorf("invalid field %q in exclude pattern %q", p.field, e)
		}
		if p.pattern == "" {
			return nil, fmt.Errorf("no pattern in %q", e)
		}

		var err error
		switch {
		case strings.HasPrefix(p.pattern, "glob:"):
			p.kind, p.pattern = excludeGlob, p.pattern[5:]
			// Matching against the empty string is only done to validate the
			// pattern up front; the result is not needed.
			_, err = doublestar.Match(p.pattern, "")
		case strings.HasPrefix(p.pattern, "re:"):
			p.kind, p.pattern = excludeRe, p.pattern[3:]
			p.re, err = regexp.Compile(p.pattern)
		}
		if err != nil {
			return nil, fmt.Errorf("invalid exclude pattern: %q: %w", e, err)
		}
		patterns = append(patterns, p)
	}

	return patterns, nil
}
109 changes: 25 additions & 84 deletions logscan/regex_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package logscan
import (
"fmt"
"regexp"
"slices"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -33,7 +32,7 @@ func (p RegexParser) Parse(line string) (Line, bool, error) {
}
}
for _, e := range p.exclude {
if parsed.exclude(e) {
if parsed.matchesPattern(e) {
return nil, true, nil
}
}
Expand Down Expand Up @@ -93,30 +92,29 @@ func newRegexParser(format, date, tyme, datetime string, exclude []string) (*Reg
}
}

case "host":
case fieldHost:
p = `(?:xn--)?[a-zA-Z0-9.-]+`
case "remote_addr":
case fieldRemoteAddr:
p = `[0-9a-fA-F:.]+`
case "xff":
case fieldXff:
p = `[0-9a-fA-F:. ,]+`

case "method":
case fieldMethod:
p = `[A-Z]{3,10}`
case "status":
case fieldStatus:
p = `\d{3}`
case "http":
case fieldHttp:
p = `HTTP/[\d.]+`
case "path":
case fieldPath:
p = `/.*?`
case "timing_sec":
p = `[\d.]+`
case "timing_milli", "timing_micro":
p = `\d+`
case "size":
case fieldSize:
p = `(?:\d+|-)`
case "referrer", "user_agent":
case fieldReferrer, fieldUserAgent:
p = `.*?`
case "query", "content_type":
case fieldQuery, fieldContentType:
// Default
}
return "(?P<" + m + ">" + p + ")"
Expand All @@ -135,78 +133,21 @@ func newRegexParser(format, date, tyme, datetime string, exclude []string) (*Reg
}, nil
}

// excludePattern is one parsed exclude rule, as produced by processExcludes.
type excludePattern struct {
kind int // exclude* constant; selects the matching method
negate bool // rule was prefixed with "!"
field string // field the rule applies to, e.g. "path", "content_type"
pattern string // pattern without any "glob:" or "re:" prefix, e.g. ".gif", "*.gif"
re *regexp.Regexp // compiled pattern; only if kind=excludeRe
}

// processExcludes parses user-supplied exclude rules into excludePatterns.
//
// Each rule has the form "[!]field:[glob:|re:]pattern":
//
//   - a leading "!" negates the rule;
//   - field must be one of the names in fields;
//   - a "glob:" prefix selects doublestar glob matching, "re:" selects regular
//     expression matching, and no prefix selects plain substring matching.
//
// The aliases "static", "html" and "redirect" are expanded to predefined
// rules. "static" expands to two rules; the second one is placed after all
// other rules.
//
// An error is returned for an unknown field, a missing pattern, or a glob or
// regular expression that is not valid. The exclude slice itself is never
// modified.
func processExcludes(exclude []string) ([]excludePattern, error) {
	// Expand aliases into a new slice instead of writing into exclude, which
	// belongs to the caller.
	rules := make([]string, 0, len(exclude)+1)
	var extra []string // second rule of each "static" alias; added last
	for _, e := range exclude {
		switch e {
		case "static":
			// Note: maybe check if using glob patterns is faster?
			rules = append(rules, `path:re:.*\.(?:js|css|gif|jpe?g|png|svg|ico|web[mp]|mp[34])$`)
			extra = append(extra, `content_type:re:^(?:text/(?:css|javascript)|image/(?:png|gif|jpeg|svg\+xml|webp)).*?`)
		case "html":
			// The "re:" prefix is required: without it the expression is
			// compared as a literal substring and can never match.
			rules = append(rules, "content_type:re:^text/html.*?")
		case "redirect":
			rules = append(rules, "status:glob:30[0123]")
		default:
			rules = append(rules, e)
		}
	}
	rules = append(rules, extra...)

	patterns := make([]excludePattern, 0, len(rules))
	for _, e := range rules {
		var p excludePattern
		if strings.HasPrefix(e, "!") {
			p.negate = true
			e = e[1:]
		}

		p.field, p.pattern, _ = strings.Cut(e, ":")
		if !slices.Contains(fields, p.field) {
			return nil, fmt.Errorf("invalid field %q in exclude pattern %q", p.field, e)
		}
		if p.pattern == "" {
			return nil, fmt.Errorf("no pattern in %q", e)
		}

		var err error
		switch {
		case strings.HasPrefix(p.pattern, "glob:"):
			p.kind, p.pattern = excludeGlob, p.pattern[5:]
			// Matching against the empty string is only done to validate the
			// pattern up front; the result is not needed.
			_, err = doublestar.Match(p.pattern, "")
		case strings.HasPrefix(p.pattern, "re:"):
			p.kind, p.pattern = excludeRe, p.pattern[3:]
			p.re, err = regexp.Compile(p.pattern)
		}
		if err != nil {
			return nil, fmt.Errorf("invalid exclude pattern: %q: %w", e, err)
		}
		patterns = append(patterns, p)
	}

	return patterns, nil
}

// RegexLine holds the fields of a single log line as strings, keyed by field
// name.
type RegexLine map[string]string

// Accessors for the individual fields. Each one reads the map entry for its
// field name; Status and Size additionally convert the string with toI.
func (l RegexLine) Host() string { return l["host"] }
func (l RegexLine) RemoteAddr() string { return l["remote_addr"] }
func (l RegexLine) XForwardedFor() string { return l["xff"] }
func (l RegexLine) Method() string { return l["method"] }
func (l RegexLine) HTTP() string { return l["http"] }
func (l RegexLine) Path() string { return l["path"] }
func (l RegexLine) Query() string { return l["query"] }
func (l RegexLine) Referrer() string { return l["referrer"] }
func (l RegexLine) UserAgent() string { return l["user_agent"] }
func (l RegexLine) ContentType() string { return l["content_type"] }
func (l RegexLine) Status() int { return toI(l["status"]) }
func (l RegexLine) Size() int { return toI(l["size"]) }
func (l RegexLine) Language() string { return l["accept_language"] }
func (l RegexLine) Host() string { return l[fieldHost] }
func (l RegexLine) RemoteAddr() string { return l[fieldRemoteAddr] }
func (l RegexLine) XForwardedFor() string { return l[fieldXff] }
func (l RegexLine) Method() string { return l[fieldMethod] }
func (l RegexLine) HTTP() string { return l[fieldHttp] }
func (l RegexLine) Path() string { return l[fieldPath] }
func (l RegexLine) Query() string { return l[fieldQuery] }
func (l RegexLine) Referrer() string { return l[fieldReferrer] }
func (l RegexLine) UserAgent() string { return l[fieldUserAgent] }
func (l RegexLine) ContentType() string { return l[fieldContentType] }
func (l RegexLine) Status() int { return toI(l[fieldStatus]) }
func (l RegexLine) Size() int { return toI(l[fieldSize]) }
func (l RegexLine) Language() string { return l[fieldAcceptLanguage] }

func (l RegexLine) Timing() time.Duration {
s, ok := l["timing_sec"]
Expand Down Expand Up @@ -259,7 +200,7 @@ func toUi64(s string) uint64 {

var _ Line = RegexLine{}

func (l RegexLine) exclude(e excludePattern) bool {
func (l RegexLine) matchesPattern(e excludePattern) bool {
var m bool
switch e.kind {
default:
Expand Down

0 comments on commit cf8c8e3

Please sign in to comment.