Processing Apache logs using a Python script

[root@localhost apache2]# cat ../apache.py
#!/usr/bin/python
# The interpreter line above only takes effect as the very first line of the script.

__author__ = 'root'

import re
import sys
from datetime import datetime, date, timedelta
from collections import Counter

# Define the day of interest in the Apache log timestamp format.
# The optional first command-line argument is the number of days back to
# report on; when it is missing or not an integer, default to yesterday (1).
try:
  daysAgo = int(sys.argv[1])
except (IndexError, ValueError):
  # IndexError: no argument given. ValueError: argument is not an integer.
  # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
  daysAgo = 1
theDay = date.today() - timedelta(daysAgo)
# Prefix of a log timestamp for that day, e.g. "[08/Jun/2015:".
apacheDay = theDay.strftime('[%d/%b/%Y:')

# Regex for the Apache combined log format (common log format plus the
# quoted referrer and user-agent fields). Each entry is one
# whitespace-separated field of a log line, in order.
parts = [
    r'(?P<host>\S+)',                   # host %h
    r'\S+',                             # ident %l (unused)
    r'(?P<user>\S+)',                   # user %u
    r'\[(?P<time>.+)\]',                # time %t
    r'"(?P<request>.*)"',               # request "%r"
    r'(?P<status>[0-9]+)',              # status %>s
    r'(?P<size>\S+)',                   # size %b (careful, can be '-')
    r'"(?P<referrer>.*)"',              # referrer "%{Referer}i"
    r'"(?P<agent>.*)"',                 # user agent "%{User-agent}i"
]
# Fields are separated by runs of whitespace; trailing whitespace (such as
# the newline) is allowed before the end of the line.
pattern = re.compile(r'\s+'.join(parts)+r'\s*\Z')

# Regex for a feed request: /all-this/feed/, optionally with a
# yyyy/mm/<slug>/ prefix before "feed/" and an "atom/" suffix after it.
feed = re.compile(r'/all-this/(\d\d\d\d/\d\d/[^/]+/)?feed/(atom/)?')

# Change Apache log items into Python types.
def pythonized(d):
  """Convert the string fields of one parsed log hit into Python types.

  d is a dict with the keys "request", "user", "referrer", "agent", "size"
  and "time" (and optionally "status"), holding the raw strings captured
  from a log line. The dict is modified in place and also returned.

  Raises ValueError if the size, status or timestamp cannot be parsed.
  """
  # Clean up the request: keep only the path from "METHOD path PROTOCOL".
  # A malformed request with no path (e.g. "-") is left unchanged instead
  # of raising IndexError.
  fields = d["request"].split()
  if len(fields) > 1:
    d["request"] = fields[1]

  # The status becomes an int. int() is a no-op if it is already an int.
  if "status" in d:
    d["status"] = int(d["status"])

  # Some dashes become None.
  for k in ("user", "referrer", "agent"):
    if d[k] == "-":
      d[k] = None

  # The size dash becomes 0.
  if d["size"] == "-":
    d["size"] = 0
  else:
    d["size"] = int(d["size"])

  # Convert the timestamp into a datetime object. Accept the server's time
  # zone: the offset is split off and discarded, so the result is naive.
  stamp, _zone = d["time"].split()
  d["time"] = datetime.strptime(stamp, "%d/%b/%Y:%H:%M:%S")

  return d

# Is this hit a page?
def ispage(hit):
  """Return True if the hit is a successful GET of a page.

  hit is a dict holding the raw "status" and "request" strings captured
  from a log line; the request must still be in its full
  "METHOD path PROTOCOL" form. A page is a GET request with a 2xx status
  whose path ends in "/" and which is not a feed request.

  Side effect: hit["status"] is replaced by its int value.
  """
  # Failures and redirects.
  hit["status"] = int(hit["status"])
  if hit["status"] < 200 or hit["status"] >= 300:
    return False

  # Malformed requests (no path, e.g. "-") cannot be pages; bail out here
  # rather than raising IndexError below.
  fields = hit["request"].split()
  if len(fields) < 2:
    return False
  method, path = fields[0], fields[1]

  # Requests that aren't GET. Compare the whole method token so that a
  # method merely starting with "GET" is not accepted.
  if method != "GET":
    return False

  # Images, sounds, etc.
  if not path.endswith('/'):
    return False

  # Feed requests.
  if feed.search(hit["request"]):
    return False

  # Must be a page.
  return True

# Regexes for internal and Google search referrers.
# NOTE(review): the internal pattern is hard-coded to leancrew.com, so
# referrers from any other site's own pages count as external. Adjust the
# host name for the server whose logs are being analysed.
internal = re.compile(r'https?://(www\.)?leancrew\.com.*')
google = re.compile(r'https?://(www\.)?google\..*')

# Is the referrer interesting? Internal and Google referrers are not.
def goodref(hit):
  """Return True if the hit has a referrer that is neither internal nor Google.

  hit is a dict whose 'referrer' value is a URL string, or None/empty when
  the request carried no referrer (in which case the result is False).
  """
  ref = hit['referrer']
  if not ref:
    return False
  # Anchor at the start of the referrer (match, not search) so that a
  # Google or internal URL embedded in another site's query string does not
  # cause that site to be discarded.
  return not (google.match(ref) or internal.match(ref))

# Initialize.
pages = []

# Parse all the lines associated with the day of interest.
for line in sys.stdin:
  # Cheap substring test first, so the regex only runs on that day's lines.
  if apacheDay not in line:
    continue
  m = pattern.match(line)
  if m is None:
    # The line is not in the expected log format; skip it rather than
    # crashing on m.groupdict().
    continue
  hit = m.groupdict()
  # ispage() must see the raw request; pythonized() then reduces it to
  # the path.
  if ispage(hit):
    pages.append(pythonized(hit))

# Show the top five pages and the total.
# print with a single parenthesized argument behaves the same on Python 2
# and Python 3.
print('%s pages' % theDay.strftime("%b %d, %Y"))
pageViews = Counter(x['request'] for x in pages)
top5 = pageViews.most_common(5)
for p in top5:
  # p is (request, count); reverse it to fit the "count  request" layout.
  print("  %5d  %s" % p[::-1])
print("  %5d  total" % len(pages))

# Show the top five referrers.
print('''
%s referrers''' % theDay.strftime("%b %d, %Y"))
referrers = Counter(x['referrer'] for x in pages if goodref(x))
top5 = referrers.most_common(5)
for r in top5:
  print("  %5d  %s" % r[::-1])

[root@localhost apache2]#
[root@localhost apache2]#
[root@localhost apache2]# cat access.log
172.24.3.49 - - [08/Jun/2015:15:25:26 +0530] "GET / HTTP/1.1" 200 3594 "-" "Mozilla/5.0"
172.24.3.49 - - [08/Jun/2015:15:25:27 +0530] "GET /nice%20ports%2C/Tri%6Eity.txt%2ebak HTTP/1.0" 404 478 "-" "-"
172.24.3.49 - - [08/Jun/2015:15:25:27 +0530] "GET / HTTP/1.0" 200 11783 "-" "-"
172.24.3.139 - - [08/Jun/2015:16:03:43 +0530] "GET /moodle/course/management.php HTTP/1.1" 303 969 "-" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:03:44 +0530] "GET /moodle/login/index.php HTTP/1.1" 200 5676 "-" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:03:49 +0530] "POST /moodle/login/index.php HTTP/1.1" 303 899 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:03:50 +0530] "GET /moodle/login/index.php HTTP/1.1" 200 5772 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:03:58 +0530] "GET /moodle/ HTTP/1.1" 200 6591 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:04:02 +0530] "GET /moodle/course/view.php?id=3 HTTP/1.1" 303 899 "http://172.24.3.139/moodle/" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:04:02 +0530] "GET /moodle/login/index.php HTTP/1.1" 200 5677 "http://172.24.3.139/moodle/" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:04:04 +0530] "POST /moodle/login/index.php HTTP/1.1" 303 1093 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:04:04 +0530] "GET /moodle/login/index.php?testsession=1 HTTP/1.1" 303 909 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:04:04 +0530] "GET /moodle/course/view.php?id=3 HTTP/1.1" 303 909 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:04:04 +0530] "GET /moodle/enrol/index.php?id=3 HTTP/1.1" 200 6123 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:04:07 +0530] "GET /moodle/course/view.php?id=3 HTTP/1.1" 303 909 "http://172.24.3.139/moodle/enrol/index.php?id=3" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:04:07 +0530] "GET /moodle/enrol/index.php?id=3 HTTP/1.1" 200 6122 "http://172.24.3.139/moodle/enrol/index.php?id=3" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:04:14 +0530] "GET /moodle/course/view.php?id=3 HTTP/1.1" 303 910 "http://172.24.3.139/moodle/enrol/index.php?id=3" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:04:14 +0530] "GET /moodle/enrol/index.php?id=3 HTTP/1.1" 200 6122 "http://172.24.3.139/moodle/enrol/index.php?id=3" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:04:19 +0530] "GET /moodle/ HTTP/1.1" 200 6707 "http://172.24.3.139/moodle/enrol/index.php?id=3" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:08:11 +0530] "GET /moodle/mod/vpl/forms/edit.php?id=2 HTTP/1.1" 303 969 "-" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:08:11 +0530] "GET /moodle/login/index.php HTTP/1.1" 200 5747 "-" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:08:11 +0530] "GET /moodle/theme/yui_combo.php?rollup/3.17.2/yui-moodlesimple-min.css HTTP/1.1" 200 1561 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:08:11 +0530] "GET /moodle/theme/yui_combo.php?rollup/3.17.2/yui-moodlesimple-min.js&rollup/1432199228/mcore-min.js HTTP/1.1" 200 89235 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:08:12 +0530] "GET /moodle/theme/javascript.php/clean/1432199228/footer HTTP/1.1" 200 1092 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:08:12 +0530] "GET /moodle/theme/image.php/clean/core/1432199228/help HTTP/1.1" 200 1615 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:08:12 +0530] "GET /moodle/lib/javascript.php/1432199228/lib/javascript-static.js HTTP/1.1" 200 10600 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:08:12 +0530] "GET /moodle/theme/styles.php/clean/1432199228/all HTTP/1.1" 200 86173 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:08:12 +0530] "GET /moodle/theme/yui_combo.php?m/1432199228/theme_bootstrapbase/bootstrap/bootstrap-min.js HTTP/1.1" 200 2464 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:08:12 +0530] "GET /moodle/theme/yui_combo.php?3.17.2/cssbutton/cssbutton-min.css HTTP/1.1" 200 1632 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:08:12 +0530] "GET /moodle/theme/yui_combo.php?3.17.2/plugin/plugin-min.js&m/1432199228/core/lockscroll/lockscroll-min.js HTTP/1.1" 200 1529 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [08/Jun/2015:16:08:12 +0530] "GET /moodle/theme/yui_combo.php?3.17.2/event-mousewheel/event-mousewheel-min.js&3.17.2/event-resize/event-resize-min.js&3.17.2/event-hover/event-hover-min.js&3.17.2/event-touch/event-touch-min.js&3.17.2/event-move/event-move-min.js&3.17.2/event-flick/event-flick-min.js&3.17.2/event-valuechange/event-valuechange-min.js&3.17.2/event-tap/event-tap-min.js HTTP/1.1" 200 5348 "http://172.24.3.139/moodle/login/index.php" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
127.0.0.1 - - [09/Jun/2015:11:51:17 +0530] "GET / HTTP/1.1" 200 3594 "-" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
127.0.0.1 - - [09/Jun/2015:11:51:17 +0530] "GET /icons/ubuntu-logo.png HTTP/1.1" 200 3688 "http://localhost/" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
127.0.0.1 - - [09/Jun/2015:11:51:17 +0530] "GET /favicon.ico HTTP/1.1" 404 498 "-" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
127.0.0.1 - - [09/Jun/2015:11:51:35 +0530] "GET /? HTTP/1.1" 200 3594 "-" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
127.0.0.1 - - [09/Jun/2015:11:51:41 +0530] "GET /moodle/ HTTP/1.1" 200 823 "-" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [09/Jun/2015:11:51:44 +0530] "GET /moodle/ HTTP/1.1" 200 6661 "-" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [09/Jun/2015:11:51:44 +0530] "GET /moodle/ HTTP/1.1" 200 6661 "http://localhost/moodle/" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
172.24.3.139 - - [09/Jun/2015:11:51:53 +0530] "GET /moodle/ HTTP/1.1" 200 6591 "-" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0"
[root@localhost apache2]#
[root@localhost apache2]#
[root@localhost apache2]#
[root@localhost apache2]# python ../apache.py < access.log
Jun 08, 2015 pages
      2  /moodle/
      2  /
      4  total

Jun 08, 2015 referrers
      1  http://172.24.3.139/moodle/login/index.php
      1  http://172.24.3.139/moodle/enrol/index.php?id=3
[root@localhost apache2]#

  • Ask Question