Table des matières

Archiver les logs de squid dans une base MySQL

Par défaut, squid envoie tous ses journaux dans des fichiers. Si c'est pratique pour suivre en temps réel qui accède à quoi, c'est déjà beaucoup plus dur de faire des analyses plus fines, par exemple :

Il existe beaucoup d'outils d'analyse des journaux de squid (le plus connu est probablement sarg), mais s'ils permettent d'avoir des statistiques intéressantes, ils n'aident pas vraiment pour les exemples ci-dessus. Pour ça, le mieux est de loguer toutes les requêtes qui passent par squid dans une base de données. Certaines versions de squid ont une directive nommée logfile_daemon, qui permet d'envoyer les journaux à un démon, qui pourra lui même enregistrer tout ça dans une base (voir par exemple http://sourceforge.net/projects/logmysqldaemon/). Mais (il y a toujours un mais), il y a deux problèmes majeurs avec cette solution:

Voilà donc la solution que nous avons mise au point : squid inscrit ses journaux dans des fichiers texte classiques, et un démon indépendant les surveille en temps réel et inscrit tout ça dans une base de données (pour nous, c'est du MySQL, mais n'importe quelle base compatible avec le module perl DBI devrait fonctionner). En plus des journaux squid, cette solution est capable d'archiver les journaux de squidGuard.

Schéma MySQL

Il faut tout d'abord créer une base de données et lui injecter ce schéma :

squid_log.sql
-- Table to store squid access logs.
--
-- NOTE(review): TIMESTAMP and DOMAIN are MySQL keywords used as column
-- names; they are kept unchanged because the logging daemon and the
-- sample queries reference them by these names.
-- MyISAM is kept on purpose: the rotation script compresses archived
-- tables with myisampack, which only works on MyISAM.

CREATE TABLE access_log (
    id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY,
    proxy_host           VARCHAR(30),    -- hostname of the proxy that logged the entry
    TIMESTAMP            DECIMAL(15,3),  -- raw UNIX epoch (with millisecond fraction) from squid
    date_day             DATE,           -- set by trigger
    date_time            TIME,           -- set by trigger
    response_time        INTEGER,
    client_ip            CHAR(15),
    squid_status         VARCHAR(30),    -- squid side of the status field (e.g. TCP_MISS)
    http_status          VARCHAR(10),    -- HTTP side of the status field
    reply_size           INTEGER,
    request_method       VARCHAR(15),
    url                  VARCHAR(200),
    DOMAIN               VARCHAR(50),    -- host part extracted from url by the daemon
    username             VARCHAR(30),
    squid_connect        VARCHAR(20),    -- hierarchy code (e.g. DIRECT)
    server_ip            CHAR(15),
    mime_type            VARCHAR(50)
) ENGINE=MYISAM;

-- trigger that extracts the date value from the timestamp column
-- and stores it in the date_day and date_time columns
-- this allows fast calculation of per-day aggregate values
DELIMITER //
CREATE TRIGGER extract_date_bi BEFORE INSERT ON access_log FOR EACH ROW
BEGIN
    SET NEW.date_day  = DATE(FROM_UNIXTIME(NEW.timestamp));
    SET NEW.date_time = TIME(FROM_UNIXTIME(NEW.timestamp));
END//
-- Restore the standard delimiter: without this, every statement that
-- follows (terminated by ';') would never be sent to the server.
DELIMITER ;
 
-- Table to store squidGuard deny logs; filled by the squidguard_log
-- thread of the logging daemon. date_day/date_time come straight from
-- the squidGuard log line (no trigger needed here).
CREATE TABLE deny_log (
    id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY,
    proxy_host           VARCHAR(30),
    date_day             DATE,
    date_time            TIME,
    category             VARCHAR(40),   -- blacklist category that matched
    client_ip            CHAR(15),
    url                  VARCHAR(200),
    DOMAIN               VARCHAR(50),   -- host part extracted from url
    username             VARCHAR(30)
) ENGINE=MYISAM;

Script de journalisation

Il suffit ensuite de lancer ce script:

squid-db-logd.pl
#!/usr/bin/perl -w

use File::Tail;
use DBI;
use URI;
use Getopt::Long;
use threads;
use threads::shared;
use strict;

our %opts = ();

# Set default options (all overridable on the command line)
$opts{squidlog}      = '/var/log/squid/access.log';
$opts{squidguardlog} = '/var/log/squidGuard/deny.log';
$opts{squid}         = 1;
$opts{squidguard}    = 0;
$opts{debug}         = 0;
$opts{dbhost}        = 'localhost';
$opts{dbname}        = 'squid_log';
$opts{dbuser}        = 'squid';
$opts{dbpass}        = 'squid';

# get command line arguments
# Fix: --dbuser was documented in the usage example but never registered
# here, so it was silently ignored and the default user was always used.
GetOptions(
    "debug=i"            => \$opts{debug},
    "squidlog=s"         => \$opts{squidlog},
    "squidguardlog=s"    => \$opts{squidguardlog},
    "squid!"             => \$opts{squid},
    "squidguard!"        => \$opts{squidguard},
    "dbhost=s"           => \$opts{dbhost},
    "dbname=s"           => \$opts{dbname},
    "dbuser=s"           => \$opts{dbuser},
    "dbpass=s"           => \$opts{dbpass}
);

# Disable output buffering on both handles so log lines appear
# immediately (the daemon is typically supervised, e.g. by runit)
select(STDOUT);
$| = 1;
select(STDERR);
$| = 1;

# Merge stderr into stdout for the supervisor's logger
open STDERR, '>&STDOUT';

# Set process name as shown in ps/top
$0 = 'squid-db-logd';

# Get hostname, stored in the proxy_host column of every row
our $host = `hostname`;
chomp($host);
 
### Subroutines
 
# Print a debug/diagnostic message followed by a newline.
# Goes to the currently selected handle — STDERR after the select()
# calls above — which is itself duplicated onto STDOUT.
sub printlog {
    my $msg = shift;
    print "$msg\n";
    return;
}
 
# Open a new handle to the MySQL database using the configured
# host/name/user/password. With RaiseError set, DBI itself dies on
# connection failure; the trailing die is a belt-and-braces guard.
sub db_connect {
    my $dsn = "DBI:mysql:database=$opts{dbname};host=$opts{dbhost}";
    my $dbh = DBI->connect($dsn, $opts{dbuser}, $opts{dbpass}, {RaiseError => 1})
        or die "Couldn't connect to database\n";
    return $dbh;
}
 
# Escape characters that are special inside a single-quoted MySQL
# string literal.
# Fix: backslashes must be escaped, and escaped FIRST. The original
# only handled quotes, so an input ending in backslash+quote became
# backslash-backslash-quote: an escaped backslash followed by an
# UNescaped quote, i.e. SQL injection via attacker-controlled URLs
# that end up in the squid log.
sub mysql_escape {
    my $string = shift;
    $string =~ s|\\|\\\\|g;   # backslashes first
    $string =~ s|'|\\'|g;     # then single quotes
    return $string;
}
 
# Follow the squid access log and insert one row per entry into the
# access_log table. Runs in its own thread; with RaiseError set on the
# handle, any database error dies and terminates the thread, which the
# main watchdog loop detects.
sub squid_log {

    my $logfile = shift;

    printlog("squid_log thread connecting to database") if ($opts{debug} ge 1);
    my $dbh = db_connect;

    # Prepare the INSERT once, with bind placeholders: no statement is
    # rebuilt/re-prepared per log line, and values are passed out of
    # band, so quoting problems and SQL injection through hostile URLs
    # in the log are impossible (the original interpolated raw values,
    # and the CONNECT-branch domain was never escaped at all).
    my $sth = $dbh->prepare(
        "INSERT INTO access_log " .
        "(proxy_host, timestamp, response_time, client_ip, squid_status, http_status, " .
        "reply_size, request_method, url, domain, username, squid_connect, server_ip, mime_type)" .
        " VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
    );

    # Open log file
    printlog("opening squid log file") if ($opts{debug} ge 1);
    my $tail = File::Tail->new(name=>$logfile, maxinterval=>15);

    while (defined(my $line=$tail->read)){
        my ($timestamp, $response_time, $client_ip, $status, $reply_size,
            $request_method, $url, $username, $server, $mime_type) = split /\s+/, $line;
        my ($squid_status, $http_status) = split /\//, $status;
        my ($squid_connect, $server_ip) = split /\//, $server;

        # Skip stats requested by squidclient
        next if ($url =~ m/^cache_object:/);

        # CONNECT requests log "host:port" instead of a full URL
        my $domain;
        if ($request_method eq 'CONNECT'){
            ($domain, undef) = split /:/, $url;
        }
        else {
            my $uri = URI->new($url);
            $domain = $uri->host;
        }

        printlog("New access_log entry:\ntimestamp: $timestamp\nresponse_time: $response_time\n".
                 "client_ip: $client_ip\nsquid_status: $squid_status\nhttp_status: $http_status\n".
                 "reply_size: $reply_size\nrequest_method: $request_method\nurl: $url\n".
                 "username: $username\nsquid_connect: $squid_connect\n".
                 "server_ip: $server_ip\nmime_type: $mime_type\n\n") if ($opts{debug} ge 2);

        $sth->execute($host, $timestamp, $response_time, $client_ip, $squid_status,
                      $http_status, $reply_size, $request_method, $url, $domain,
                      $username, $squid_connect, $server_ip, $mime_type);
    }
}
 
# Follow the squidGuard deny log and insert one row per entry into the
# deny_log table. Runs in its own thread (see squid_log for the error
# handling model).
sub squidguard_log {

    my $logfile = shift;

    printlog("squidguard_log thread connecting to database") if ($opts{debug} ge 1);
    my $dbh = db_connect;

    # Prepared once with bind placeholders (see squid_log for rationale)
    my $sth = $dbh->prepare(
        "INSERT INTO deny_log " .
        "(proxy_host, date_day, date_time, category, client_ip, url, domain, username)" .
        " VALUES(?,?,?,?,?,?,?,?)"
    );

    # Open log file
    printlog("opening squidGuard log file") if ($opts{debug} ge 1);
    my $tail = File::Tail->new(name=>$logfile, maxinterval=>15);

    while (defined(my $line=$tail->read)){
        my ($date_day, $date_time, undef, $category, $url, $client_ip, $username) = split /\s+/, $line;

        # Extract the category name from e.g. "default/adult".
        # Fix: the original assigned $1 unconditionally, so a
        # non-matching line silently reused the stale $1 of a previous
        # successful match.
        if ($category =~ m/default\/(\w+)/){
            $category = $1;
        }
        # Strip the trailing "/-" squidGuard appends to the client field
        $client_ip =~ s/\/\-$//;

        my $uri = URI->new($url);
        my $domain = $uri->host;

        printlog("New deny_log entry:\ndate: $date_day\ntime: $date_time\ncategory: $category\n".
                 "client_ip: $client_ip\nurl: $url\nusername: $username\n\n") if ($opts{debug} ge 2);

        $sth->execute($host, $date_day, $date_time, $category, $client_ip, $url, $domain, $username);
    }
}
 
printlog("Starting log monitoring threads") if ($opts{debug} ge 1);

# Start one worker thread per enabled log source.
# Fix 1: "my $x = EXPR if COND;" has undefined behaviour in Perl and is
# a documented trap — replaced with an unconditional my plus push.
# Fix 2: the expected thread count started at 1 even when the squid
# thread was disabled, so "--nosquid --squidguard" made the watchdog
# believe a thread had already died and killed the daemon at startup.
my @workers;
push @workers, threads->create('squid_log', $opts{squidlog}) if $opts{squid};
push @workers, threads->create('squidguard_log', $opts{squidguardlog}) if $opts{squidguard};

die "Nothing to monitor: both squid and squidGuard logging are disabled\n" unless @workers;

# Watchdog: sleep while every worker is still running; if any thread
# exits or dies, fall through and terminate the whole daemon so the
# supervisor (e.g. runit) can restart it.
while (scalar(threads->list(threads::running)) >= scalar(@workers)){
    sleep(5);
}

die "At least one thread died\n";

Ce script peut être lancé en tant que squid (il a simplement besoin d'avoir accès en lecture aux journaux de squid). Ce script ne se détache pas de la console (je le fais tourner avec runit). On peut le lancer comme ça:

./squid-db-logd.pl --dbhost=127.0.0.1 --dbname=squid_log --dbuser=squid --dbpass=secret \
    --squidguard --squidlog=/var/log/squid/access.log --squidguardlog=/var/log/squidGuard/deny.log

Rotation et compression des tables

La journalisation dans une base de données peut consommer beaucoup d'espace disque (en fonction du nombre d'utilisateurs derrière ce proxy). L'idée est donc de faire une rotation des tables access_log et deny_log une fois par mois. Les anciennes tables sont compressées (toujours consultables directement par MySQL, mais en lecture seule) et les tables trop anciennes sont supprimées (par exemple, on conserve 1 an de journaux). Voilà un petit script cron à mettre en place :

squid-rotate.sh
#!/bin/bash

DB_HOST=localhost
# Log retention, in days
RETENTION=365
SQL_DB=squid_log
TABNAME="access_log deny_log"
SQLCMD="mysql ${SQL_DB} --batch -u squid -psecret"
MONTH=$(date +%m)
YEAR=$(date +%Y)

# We rotate on the first day of a new month, so archived tables are
# named after the PREVIOUS month.
# Fixes vs. the original:
#  - date +%m prints "01", not "1", so the January test never matched;
#  - in January the year must roll back too, otherwise December's data
#    was archived under the wrong year;
#  - force base 10 in the arithmetic: "$((08-1))" and "$((09-1))" are
#    invalid-octal errors in bash.
if [ "$MONTH" == "01" ]; then
    MONTH=12
    YEAR=$((YEAR - 1))
else
    MONTH=$((10#$MONTH - 1))
fi

# Pad with 0
MONTH=$(printf "%02d" $MONTH)

DATE=${MONTH}_${YEAR}

for T in ${TABNAME}; do
    # Create an empty replacement table with the same structure
    echo "CREATE TABLE IF NOT EXISTS ${T}_0 LIKE ${T};" | $SQLCMD;

    # Swap the live table with the empty one
    echo "FLUSH TABLES ${T}; RENAME TABLE ${T} TO ${T}_$DATE; RENAME TABLE ${T}_0 TO ${T}" | ${SQLCMD} >/dev/null 2>&1

    # Drop the leftover _0 table if we rotated more than once this month
    if echo "DESCRIBE ${T}_0;" | ${SQLCMD} >/dev/null 2>&1; then
        echo "DROP TABLE ${T}_0;" | $SQLCMD
    fi

    # Compress the archived table (MyISAM only; it becomes read-only).
    # myisampack/myisamchk operate on the .MYI index file directly.
    cd /var/lib/mysql/${SQL_DB}/ || exit 1
    echo "FLUSH TABLE ${T}_${DATE};" | $SQLCMD
    myisampack -s "${T}_${DATE}.MYI"
    myisamchk -s -rq --sort-index --analyze "${T}_${DATE}.MYI"
    echo "FLUSH TABLE ${T}_${DATE}" | $SQLCMD
done

# Now check existing archived tables and drop the ones older than the
# retention period.
# Fix: use ISO dates for date(1) — the original "01/$MONTH/$YEAR" was
# parsed as the US form "January $MONTH", not "first day of $MONTH".
for T in $(echo "show tables" | $SQLCMD | grep -v -P "^Tables_in_${SQL_DB}" | grep -v -P "^(access|deny)_log$"); do
    TMONTH=$(echo "$T" | perl -ne 'print $2 if m/^(access|deny)_log_(\d+)_(\d+)/')
    TYEAR=$(echo "$T" | perl -ne 'print $3 if m/^(access|deny)_log_(\d+)_(\d+)/')
    # Skip tables that do not follow the <name>_MM_YYYY naming scheme
    [ -z "$TMONTH" ] && continue
    if [ "$(($(date -d "$YEAR-$MONTH-01" +%s)-$(date -d "$TYEAR-$TMONTH-01" +%s)))" -gt "$((24*3600*RETENTION))" ]; then
        echo "DROP TABLE $T;" | $SQLCMD
    fi
done

Quelques requêtes utiles

-- Top 30 most requested domains
SELECT DOMAIN,COUNT(DOMAIN) AS occurances FROM access_log GROUP BY DOMAIN ORDER BY occurances DESC LIMIT 30;
-- Top 10 squidGuard categories that triggered a deny
SELECT category,COUNT(category) AS occurances FROM deny_log GROUP BY category ORDER BY occurances DESC LIMIT 10;
-- Everything a given client requested during a given one-hour window
SELECT date_day,date_time,url,username FROM access_log WHERE client_ip='192.168.7.50' AND date_day='2012-10-08' AND date_time>'22:00:00' AND date_time<'23:00:00';
-- Same as above, exported as a CSV file on the server side
SELECT date_day,date_time,url,username INTO OUTFILE '/tmp/result.csv' FIELDS TERMINATED BY ','
OPTIONALLY ENCLOSED BY '"' ESCAPED BY '\\' LINES TERMINATED BY '\n'
FROM access_log WHERE client_ip='192.168.7.50' AND date_day='2012-10-08' AND date_time>'22:00:00' AND date_time<'23:00:00';