From 8d2a18fddc9659d98fcb38e87ac5453a31dcb58d Mon Sep 17 00:00:00 2001 From: Darold Gilles Date: Mon, 13 Aug 2012 21:51:20 +0200 Subject: [PATCH] CSV log parser have been entirely rewritten to handle csv with multiline, it also add approximative duration for csvlog. Reported by Ludhimila Kendrick --- pgbadger | 316 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 167 insertions(+), 149 deletions(-) diff --git a/pgbadger b/pgbadger index 9662718..e002f89 100755 --- a/pgbadger +++ b/pgbadger @@ -380,187 +380,213 @@ if ($last_parsed && -e $last_parsed) { foreach my $logfile (@log_files) { &logmsg('DEBUG', "Starting to parse log file: $logfile"); - # Open log file for reading - my $totalsize = (stat("$logfile"))[7] || 0; - my $cursize = 0; - my $lfile = new IO::File; - if ($logfile !~ /\.gz/) { - $lfile->open($logfile) || die "FATAL: cannot read logfile $logfile. $!\n"; - } else { - - # Open a pipe to zcat program for compressed log - $lfile->open("$zcat $logfile |") || die "FATAL: cannot read from pipe to $zcat $logfile. $!\n"; - - # Real size of the file is unknow - $totalsize = 0; - } - my $curdate = localtime(time); # Syslog do not have year information, so take care of year overlapping my ($gsec, $gmin, $ghour, $gmday, $gmon, $gyear, $gwday, $gyday, $gisdst) = localtime(time); $gyear += 1900; my $CURRENT_DATE = $gyear . sprintf("%02d", $gmon + 1) . sprintf("%02d", $gmday); + # Get size of the file + my $totalsize = (stat("$logfile"))[7] || 0; + my $cursize = 0; - my $cur_pid = ''; - my $csv_obj = undef; if ($format eq 'csv') { require Text::CSV; - $csv_obj = Text::CSV->new({'binary' => 1}); - } - - while (my $line = <$lfile>) { - $cursize += length($line); - chomp($line); - $line =~ s/ //; - $nlines++; - next if (!$line); - - if ($progress && (($nlines % $NUMPROGRESS) == 0)) { - if ($totalsize) { - print progress_bar($cursize, $totalsize, 25, '='); - } else { - print "."; + my $csv = Text::CSV->new ({ binary => 1, eol => $/ }); + open(my $io, "<", $logfile) or die "FATAL: cannot read csvlog file $logfile. $!\n"; + # Parse csvlog lines + while (my $row = $csv->getline($io)) { + + # Set progress statistics + $cursize += length(join(',', @$row)); + $nlines++; + if ($progress && (($nlines % $NUMPROGRESS) == 0)) { + if ($totalsize) { + print progress_bar($cursize, $totalsize, 25, '='); + } else { + print "."; + } } + + # Extract the date + $row->[0] =~ m/^(\d+)-(\d+)-(\d+)\s+(\d+):(\d+):(\d+)\.(\d+)/; + my @date = ($1, $2, $3, $4, $5, $6); + my $cur_date = join('', @date); + my $end_time = timegm_nocheck($6, $5, $4, $3, $2, $1); + my $milli = $7 || 0; + $row->[8] =~ m/^(\d+)-(\d+)-(\d+)\s+(\d+):(\d+):(\d+)/; + my $start_time = timegm_nocheck($6, $5, $4, $3, $2, $1); + my $duration = (($end_time - $start_time) * 1000) - $milli; + $duration = 0 if ($duration < 0); + + # Skip unwanted lines + next if ($from && ($from > $cur_date)); + last if ($to && ($to < $cur_date)); + + # Jump to the last line parsed if required + next if (!&check_incremental_position($cur_date, join(',', @$row))); + + # Set approximative session duration + if ($row->[13] && ($row->[13] !~ m/duration: \d+\.\d+ ms/)) { + $row->[13] = "duration: $duration ms $row->[13]"; + } + + # Process the log line + &parse_query( + @date, + $row->[4], # connection from + $row->[3], # pid + $row->[5], # session + 'user=' . $row->[1] . ',db=' . $row->[2], # logprefix + $row->[11], # loglevel + $row->[13] || $row->[14] || $row->[15], # message || detail || hint + ); } + $csv->eof or die "FATAL: cannot use CSV, " . $csv->error_diag() . "\n"; + close $io; + } else { - # Parse syslog lines - if ($format eq 'syslog') { + # Open log file for reading + my $lfile = new IO::File; + if ($logfile !~ /\.gz/) { + $lfile->open($logfile) || die "FATAL: cannot read log file $logfile. $!\n"; + } else { - if ($line =~ - /^(...)\s+(\d+)\s(\d+):(\d+):(\d+)\s([^\s]+)\s([^\[]+)\[(\d+)\]:\s\[([0-9\-]+)\]\s*([^:]*?)\s*([A-Z]+?):\s+(.*)/) - { + # Open a pipe to zcat program for compressed log + $lfile->open("$zcat $logfile |") || die "FATAL: cannot read from pipe to $zcat $logfile. $!\n"; - # skip non postgresql lines - next if ($7 ne $ident); + # Real size of the file is unknow + $totalsize = 0; + } - # Syslog do not have year information, so take care of year overlapping - my $tmp_year = $gyear; - if ("$tmp_year$month_abbr{$1}$2" > $CURRENT_DATE) { - $tmp_year = substr($CURRENT_DATE, 1, 4) - 1; - } + my $cur_pid = ''; + while (my $line = <$lfile>) { + $cursize += length($line); + chomp($line); + $line =~ s/ //; + $nlines++; + next if (!$line); - # Fix single digit days - my $day = sprintf("%02d", $2); - my $cur_date = "$tmp_year$month_abbr{$1}$day$3$4$5"; + if ($progress && (($nlines % $NUMPROGRESS) == 0)) { + if ($totalsize) { + print progress_bar($cursize, $totalsize, 25, '='); + } else { + print "."; + } + } - # Skip unwanted lines - next if ($from && ($from > $cur_date)); - last if ($to && ($to < $cur_date)); - $cur_pid = $8; + # Parse syslog lines + if ($format eq 'syslog') { - # Jump to the last line parsed if required - next if (!&check_incremental_position($cur_date, $line)); + if ($line =~ + /^(...)\s+(\d+)\s(\d+):(\d+):(\d+)\s([^\s]+)\s([^\[]+)\[(\d+)\]:\s\[([0-9\-]+)\]\s*([^:]*?)\s*([A-Z]+?):\s+(.*)/) + { - # Process the log line - &parse_query($tmp_year, $month_abbr{$1}, $day, $3, $4, $5, $6, $8, $9, $10, $11, $12); + # skip non postgresql lines + next if ($7 ne $ident); - } elsif ($line =~ /^(...)\s+(\d+)\s(\d+):(\d+):(\d+)\s([^\s]+)\s([^\[]+)\[(\d+)\]:\s\[([0-9\-]+)\]\s*(.*)/) { + # Syslog do not have year information, so take care of year overlapping + my $tmp_year = $gyear; + if ("$tmp_year$month_abbr{$1}$2" > $CURRENT_DATE) { + $tmp_year = substr($CURRENT_DATE, 1, 4) - 1; + } - $cur_pid = $8; - my $t_query = $10; - $t_query =~ s/#011/\t/g; - if ($cur_info{$cur_pid}{statement}) { - $cur_info{$cur_pid}{statement} .= "\n" . $t_query; - } elsif ($cur_info{$cur_pid}{context}) { - $cur_info{$cur_pid}{context} .= "\n" . $t_query; - } elsif ($cur_info{$cur_pid}{detail}) { - $cur_info{$cur_pid}{detail} .= "\n" . $t_query; - } else { - $cur_info{$cur_pid}{query} .= "\n" . $t_query; - } + # Fix single digit days + my $day = sprintf("%02d", $2); + my $cur_date = "$tmp_year$month_abbr{$1}$day$3$4$5"; + + # Skip unwanted lines + next if ($from && ($from > $cur_date)); + last if ($to && ($to < $cur_date)); + $cur_pid = $8; + + # Jump to the last line parsed if required + next if (!&check_incremental_position($cur_date, $line)); + + # Process the log line + &parse_query($tmp_year, $month_abbr{$1}, $day, $3, $4, $5, $6, $8, $9, $10, $11, $12); + + } elsif ($line =~ /^(...)\s+(\d+)\s(\d+):(\d+):(\d+)\s([^\s]+)\s([^\[]+)\[(\d+)\]:\s\[([0-9\-]+)\]\s*(.*)/) { + + $cur_pid = $8; + my $t_query = $10; + $t_query =~ s/#011/\t/g; + if ($cur_info{$cur_pid}{statement}) { + $cur_info{$cur_pid}{statement} .= "\n" . $t_query; + } elsif ($cur_info{$cur_pid}{context}) { + $cur_info{$cur_pid}{context} .= "\n" . $t_query; + } elsif ($cur_info{$cur_pid}{detail}) { + $cur_info{$cur_pid}{detail} .= "\n" . $t_query; + } else { + $cur_info{$cur_pid}{query} .= "\n" . $t_query; + } - # Collect orphans lines of multiline queries - } elsif ($line !~ /^...\s+\d+\s\d+:\d+:\d+\s[^\s]+\s[^\[]+\[\d+\]:/) { + # Collect orphans lines of multiline queries + } elsif ($line !~ /^...\s+\d+\s\d+:\d+:\d+\s[^\s]+\s[^\[]+\[\d+\]:/) { + + if ($cur_info{$cur_pid}{statement}) { + $cur_info{$cur_pid}{statement} .= "\n" . $line; + } elsif ($cur_info{$cur_pid}{context}) { + $cur_info{$cur_pid}{context} .= "\n" . $line; + } elsif ($cur_info{$cur_pid}{detail}) { + $cur_info{$cur_pid}{detail} .= "\n" . $line; + } else { + $cur_info{$cur_pid}{query} .= "\n" . $line; + } - if ($cur_info{$cur_pid}{statement}) { - $cur_info{$cur_pid}{statement} .= "\n" . $line; - } elsif ($cur_info{$cur_pid}{context}) { - $cur_info{$cur_pid}{context} .= "\n" . $line; - } elsif ($cur_info{$cur_pid}{detail}) { - $cur_info{$cur_pid}{detail} .= "\n" . $line; } else { - $cur_info{$cur_pid}{query} .= "\n" . $line; + &logmsg('DEBUG', "Unknown syslog line format: $line"); } - } else { - &logmsg('DEBUG', "Unknown syslog line format: $line"); - } + } elsif ($format eq 'stderr') { - } elsif ($format eq 'stderr') { + # Parse stderr lines + if ($line =~ /(\d+)-(\d+)-(\d+)\s(\d+):(\d+):(\d+)\s([^\s]+)\s\[(\d+)\]:\s\[([0-9\-]+)\]\s*([^:]*?)\s*([A-Z]+?):\s+(.*)/) + { - # Parse stderr lines - if ($line =~ /(\d+)-(\d+)-(\d+)\s(\d+):(\d+):(\d+)\s([^\s]+)\s\[(\d+)\]:\s\[([0-9\-]+)\]\s*([^:]*?)\s*([A-Z]+?):\s+(.*)/) - { + # Skip unwanted lines + my $cur_date = "$1$2$3$4$5$6"; + next if ($from && ($from > $cur_date)); + last if ($to && ($to < $cur_date)); + $cur_pid = $8; - # Skip unwanted lines - my $cur_date = "$1$2$3$4$5$6"; - next if ($from && ($from > $cur_date)); - last if ($to && ($to < $cur_date)); - $cur_pid = $8; + # Jump to the last line parsed if required + next if (!&check_incremental_position($cur_date, $line)); - # Jump to the last line parsed if required - next if (!&check_incremental_position($cur_date, $line)); + # Process the log line + &parse_query($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12); - # Process the log line - &parse_query($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12); + # Collect orphans lines of multiline queries + } elsif ($line !~ /^\d+-\d+-\d+\s\d+:\d+:\d+\s[^\s]+\s\[\d+\]:/) { - # Collect orphans lines of multiline queries - } elsif ($line !~ /^\d+-\d+-\d+\s\d+:\d+:\d+\s[^\s]+\s\[\d+\]:/) { + if ($cur_info{$cur_pid}{statement}) { + $cur_info{$cur_pid}{statement} .= "\n" . $line; + } elsif ($cur_info{$cur_pid}{context}) { + $cur_info{$cur_pid}{context} .= "\n" . $line; + } elsif ($cur_info{$cur_pid}{detail}) { + $cur_info{$cur_pid}{detail} .= "\n" . $line; + } else { + $cur_info{$cur_pid}{query} .= "\n" . $line; + } - if ($cur_info{$cur_pid}{statement}) { - $cur_info{$cur_pid}{statement} .= "\n" . $line; - } elsif ($cur_info{$cur_pid}{context}) { - $cur_info{$cur_pid}{context} .= "\n" . $line; - } elsif ($cur_info{$cur_pid}{detail}) { - $cur_info{$cur_pid}{detail} .= "\n" . $line; } else { - $cur_info{$cur_pid}{query} .= "\n" . $line; + $cur_info{$cur_pid}{query} .= "\n" . $line if ($cur_info{$cur_pid}{query}); } } else { - $cur_info{$cur_pid}{query} .= "\n" . $line if ($cur_info{$cur_pid}{query}); - } - - } elsif ($format eq 'csv') { - # Parse csvlog lines - if ($csv_obj->parse($line)) { - my @cols = $csv_obj->fields(); - - # Extract the date - $cols[0] =~ m/(\d+)-(\d+)-(\d+)\s+(\d+):(\d+):(\d+)/; - my @date = ($1, $2, $3, $4, $5, $6); - my $cur_date = join('', @date); - - # Skip unwanted lines - next if ($from && ($from > $cur_date)); - last if ($to && ($to < $cur_date)); - - # Jump to the last line parsed if required - next if (!&check_incremental_position($cur_date, $line)); - - # Process the log line - &parse_query( - @date, - $cols[4], # connection from - $cols[3], # pid - $cols[5], # session - # logprefix - 'user=' . $cols[1] . ',db=' . $cols[2], - $cols[11] . ':', # loglevel - $cols[13], # query - ); - } else { - &logmsg('DEBUG', "Unknown csv line format: $line, error: " . $csv_obj->error_input()); + # unknown format + &logmsg('DEBUG', "Unknown line format: $line"); } - - } else { - - # unknown format - &logmsg('DEBUG', "Unknown line format: $line"); } + $lfile->close(); + } + + # Get stats from all pending temporary storage + foreach my $pid (sort {$cur_info{$a}{date} <=> $cur_info{$b}{date}} %cur_info) { + &store_queries($pid); } + %cur_info = (); if ($progress) { if ($totalsize) { @@ -569,14 +595,6 @@ foreach my $logfile (@log_files) { print STDERR "\n"; } - $lfile->close(); - - # Get stats from all pending temporary storage - foreach my $pid (sort {$cur_info{$a}{date} <=> $cur_info{$b}{date}} %cur_info) { - &store_queries($pid); - } - %cur_info = (); - } # Save last line parsed @@ -3128,7 +3146,7 @@ sub parse_query # When we are ready to overwrite the last storage, add it to the global stats if ( ($t_loglevel =~ /LOG|FATAL|PANIC|ERROR|WARNING|HINT/) && exists $cur_info{$t_pid} - && ($t_session != $cur_info{$t_pid}{session})) + && (($format eq 'csv') || ($t_session != $cur_info{$t_pid}{session}))) { &store_queries($t_pid); delete $cur_info{$t_pid}; -- 2.50.1