# Sample output (matches the print statements of the script below):
# 日志解析完成,总访问量:641805 条
# ===== 统计结果 =====
# 总访问量:641805
# 独立IP数:14
# 最活跃IP:192.168.1.29(访问492948次)
# 访问高峰时段:16时(59997次)
# 图表已保存至:C:/Users/czliu/Downloads/nginx_analysis.png

import re
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from datetime import datetime
# ===================== Configuration (path to your access log) =====================
LOG_PATH = r"C:\Users\czliu\Downloads\access.log"
# ====================================================================================
# Helper: strip characters that are not printable from a string.
def clean_string(s):
    """Return *s* with every non-printable character removed.

    Printability is decided by ``str.isprintable``, so control characters
    (including tabs and newlines) and stray binary bytes are dropped, while
    ordinary text and spaces are kept.

    Args:
        s: Value to sanitize.

    Returns:
        The filtered string when *s* is a ``str``; any other value is
        returned unchanged.
    """
    if not isinstance(s, str):
        return s
    return ''.join(ch for ch in s if ch.isprintable())
# 1. Regular expression for Nginx access-log lines ("combined" layout).
# Expected line shape: IP - USER [time] "request" status size "referer" "UA"
# Capture groups, in order: client IPv4 address, timestamp, request line,
# status code, body size in bytes (or "-").  Referer and UA are not captured.
# The remote-user field is matched with \S+ rather than a literal "-", so
# requests made with HTTP authentication are parsed as well.
log_pattern = re.compile(
    r'(\d+\.\d+\.\d+\.\d+)\s+-\s+\S+\s+\[(.*?)\]\s+"(.*?)"\s+(\d+)\s+(\d+|-)'
)
# 2. Parse the log file into a list of per-request records.
# Each record is a dict with the keys: ip, time, hour, method, url, status,
# size.  Lines that do not match log_pattern are skipped.
parsed_data = []
with open(LOG_PATH, 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        # Sanitize the whole line once; every captured field is a substring
        # of the cleaned line, so the fields need no further cleaning.
        match = log_pattern.match(clean_string(line).strip())
        if not match:
            continue
        ip, time_str, request, status, size = match.groups()

        # Timestamp layout in the log: 03/Jan/2025:12:34:56 +0800
        try:
            dt = datetime.strptime(time_str, '%d/%b/%Y:%H:%M:%S %z')
        except ValueError:
            # Unparseable timestamp: keep the record but flag the hour as
            # unknown with the sentinel -1.
            dt, hour = None, -1
        else:
            hour = dt.hour  # Bucket by hour of day.

        # The request line is "METHOD URL PROTOCOL"; fall back to defaults
        # when it is empty or truncated.
        req_parts = request.split()
        method = req_parts[0] if len(req_parts) > 0 else 'UNKNOWN'
        url = req_parts[1] if len(req_parts) > 1 else '/'

        parsed_data.append({
            'ip': ip,
            'time': dt,
            'hour': hour,
            'method': method,
            'url': url,
            'status': int(status),
            # Nginx logs "-" when no body size is available; count it as 0.
            'size': int(size) if size != '-' else 0,
        })
# Load the records into a DataFrame for analysis.  The column list is given
# explicitly so that an empty parsed_data still yields a frame containing
# every column; the statistics below then evaluate to empty results instead
# of raising KeyError on a missing column.
df = pd.DataFrame(
    parsed_data,
    columns=['ip', 'time', 'hour', 'method', 'url', 'status', 'size'],
)
print(f"日志解析完成,总访问量:{len(df)} 条")

# 3. Core statistics
# 3.1 The ten most frequent client IPs as (ip, count) pairs, busiest first.
ip_top10 = Counter(df['ip']).most_common(10)
# 3.2 Requests per hour of day, ordered by hour.  Rows whose hour is the
#     sentinel -1 (timestamp could not be parsed) are left out.
hour_counts = df.loc[df['hour'] != -1, 'hour'].value_counts().sort_index()
# 3.3 Distribution of HTTP status codes.
status_counts = df['status'].value_counts()
# 3.4 Distribution of request methods.
method_counts = df['method'].value_counts()
# 4. Draw the combined report (2x2 grid of subplots) and save it as a PNG.
def _show_no_data(ax, title):
    """Give *ax* its title and a centered "no data" placeholder text."""
    ax.set_title(title, fontsize=14)
    ax.text(0.5, 0.5, '无数据', ha='center', va='center')


if len(df) > 0:
    plt.rcParams['font.sans-serif'] = ['SimHei']  # Font used so the Chinese labels render
    plt.rcParams['axes.unicode_minus'] = False
    plt.rcParams['text.usetex'] = False  # Disable LaTeX text rendering
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 10))
    fig.suptitle('Nginx访问日志分析报告', fontsize=18, fontweight='bold')

    # Subplot 1: top-10 client IPs.
    if ip_top10:
        # Cap each label at 15 characters so an over-long value cannot
        # crowd the axis.
        ip_labels = [
            ip if len(ip) <= 15 else ip[:15] + '...'
            for ip, _ in ip_top10
        ]
        ip_hits = [hits for _, hits in ip_top10]
        ax1.barh(ip_labels, ip_hits, color='#1f77b4')
        # barh draws the first entry at the bottom; flip the axis so the
        # busiest IP is listed at the top.
        ax1.invert_yaxis()
        ax1.set_title('TOP10 访问IP', fontsize=14)
        ax1.set_xlabel('访问次数')
    else:
        _show_no_data(ax1, 'TOP10 访问IP')

    # Subplot 2: requests per hour of day.
    if not hour_counts.empty:
        ax2.plot(hour_counts.index, hour_counts.values,
                 marker='o', color='#ff7f0e', linewidth=2)
        ax2.set_title('24小时访问趋势', fontsize=14)
        ax2.set_xlabel('小时')
        ax2.set_ylabel('访问量')
        ax2.grid(alpha=0.3)
    else:
        _show_no_data(ax2, '24小时访问趋势')

    # Subplot 3: HTTP status-code distribution.
    if not status_counts.empty:
        colors = ['#2ca02c', '#d62728', '#9467bd', '#8c564b']
        # Pie labels must be strings; the index holds integer status codes.
        labels = [str(label) for label in status_counts.index]
        ax3.pie(status_counts.values, labels=labels, autopct='%1.1f%%',
                colors=colors[:len(status_counts)])
        ax3.set_title('HTTP状态码分布', fontsize=14)
    else:
        _show_no_data(ax3, 'HTTP状态码分布')

    # Subplot 4: request-method distribution.  A method is shown when it is
    # one of the listed HTTP verbs or when it occurs more than 10 times.
    known_methods = {'GET', 'POST', 'PUT', 'DELETE', 'HEAD', 'OPTIONS'}
    shown_methods = [
        (method, cnt)
        for method, cnt in method_counts.items()
        if method in known_methods or cnt > 10
    ]
    if shown_methods:
        ax4.bar([method for method, _ in shown_methods],
                [cnt for _, cnt in shown_methods],
                color='#e377c2')
        ax4.set_title('请求方法分布', fontsize=14)
        ax4.set_ylabel('次数')
    else:
        _show_no_data(ax4, '请求方法分布')

    plt.tight_layout()
    plt.savefig(r'C:\Users\czliu\Downloads\nginx_analysis.png', dpi=300, bbox_inches='tight')
    plt.close(fig)  # Close the figure to avoid problems in non-interactive environments.
# 5. Print the headline statistics.
print("\n===== 统计结果 =====")
print(f"总访问量:{len(df)}")
print(f"独立IP数:{df['ip'].nunique()}")

# Most active client: the first entry of ip_top10, if there is one.
if ip_top10:
    top_ip, top_hits = ip_top10[0]
    print(f"最活跃IP:{top_ip}(访问{top_hits}次)")
else:
    print("最活跃IP:无数据")

# Peak hour: the hour of day with the highest request count.
if not hour_counts.empty:
    print(f"访问高峰时段:{hour_counts.idxmax()}时({hour_counts.max()}次)")
else:
    print("访问高峰时段:无数据")

# The chart is only produced when at least one record was parsed.
if len(df) > 0:
    print("图表已保存至:C:/Users/czliu/Downloads/nginx_analysis.png")
else:
    print("图表:无数据,未生成")
