using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Web;
using NewLife;
using NewLife.Caching;
using NewLife.Log;
using NewLife.Serialization;
using NewLife.Threading;
using Stardust.Data;
using Stardust.Data.Monitors;
using Stardust.Data.Nodes;
namespace Stardust.Server.Services
{
/// <summary>Contract of the alarm service.</summary>
public interface IAlarmService
{
/// <summary>Adds an application that needs statistics; duplicates are removed.</summary>
/// <param name="appId">Identifier of the application to add.</param>
void Add(Int32 appId);
}
public class AlarmService : IAlarmService
{
/// <summary>Calculation period in seconds. Defaults to 30.</summary>
public Int32 Period { get; set; } = 30;
// Timer that invokes DoAlarm; created in the constructor.
private readonly TimerX _timer;
// Application ids queued through Add and drained by DoAlarm.
private readonly ConcurrentBag<Int32> _bag = new();
//private WeiXinClient _weixin;
//private DingTalkClient _dingTalk;
// Remembers the last alarmed value per "alarm:..." key, used to avoid repeating the same alarm.
private readonly ICache _cache = new MemoryCache();
// Tracer used to open a span around each alarm check; every use is null-conditional.
private readonly ITracer _tracer;
/// <summary>Creates the service and starts the timer that runs the alarm checks.</summary>
/// <param name="tracer">Tracer used to open spans around the checks; stored as given.</param>
public AlarmService(ITracer tracer)
{
    // Store the tracer before the timer is armed, so the callback can never
    // run against a partially initialized instance.
    _tracer = tracer;

    // Initialize the timer: arguments are a 5_000 ms initial delay and a Period * 1000 ms interval.
    // DoAlarm re-applies Period after each run, so later changes to the property take effect.
    _timer = new TimerX(DoAlarm, null, 5_000, Period * 1000) { Async = true };
}
/// <summary>Queues an application that needs statistics; an id that is already queued is not added again.</summary>
/// <param name="appId">Identifier of the application.</param>
public void Add(Int32 appId)
{
    // Already queued, nothing to do
    if (_bag.Contains(appId)) return;

    _bag.Add(appId);
}
/// <summary>Timer callback: drains the queued application ids, then checks every application, node and Redis node for alarms.</summary>
/// <param name="state">Timer state; not used.</param>
private void DoAlarm(Object state)
{
    // Queued ids are only drained for now; the per-application processing is commented out
    while (_bag.TryTake(out _))
    {
        //Process(appId);
    }

    // Application alarms
    RunChecks(AppTracer.FindAllWithCache(), ProcessAppTracer);

    // Node alarms
    RunChecks(Node.FindAllWithCache(), ProcessNode);

    // Redis alarms
    RunChecks(RedisNode.FindAllWithCache(), ProcessRedisNode);

    // Pick up changes made to Period since the timer was created
    if (Period > 0) _timer.Period = Period * 1000;
}

/// <summary>Runs a check on every item; a failure on one item is logged and does not stop the remaining items.</summary>
/// <param name="items">Items to check.</param>
/// <param name="check">Check to run on each item.</param>
private static void RunChecks<T>(IEnumerable<T> items, Action<T> check)
{
    foreach (var item in items)
    {
        try
        {
            check(item);
        }
        catch (Exception ex)
        {
            // Keep going: one failing item must not silence the alarms of all the others
            XTrace.WriteException(ex);
        }
    }
}
#region 应用性能追踪告警
/// <summary>Checks the error count of one application for the current 5-minute slot and sends an alarm when it reaches the threshold.</summary>
/// <param name="app">Application to check; a null or disabled application, or one without a threshold, is skipped.</param>
private void ProcessAppTracer(AppTracer app)
{
    // Does this application need alarms at all
    if (app == null) return;
    if (!app.Enable) return;
    if (app.AlarmThreshold <= 0) return;

    var appId = app.ID;
    if (!RobotHelper.CanAlarm(app.Category, app.AlarmRobot)) return;

    using var span = _tracer?.NewSpan($"Alarm:{nameof(AppTracer)}");

    // Start of the 5-minute slot that contains the current time
    var now = DateTime.Now;
    var slotMinute = now.Minute / 5 * 5;
    var slotStart = now.Date.AddHours(now.Hour).AddMinutes(slotMinute);

    var stat = AppMinuteStat.FindByAppIdAndTime(appId, slotStart);
    if (stat == null) return;

    // Below the threshold, no alarm
    if (stat.Errors < app.AlarmThreshold) return;

    // While a previous alarm is remembered, alarm again only when the errors have more than doubled
    var cacheKey = "alarm:AppTracer:" + appId;
    var lastErrors = _cache.Get<Int32>(cacheKey);
    if (lastErrors != 0 && stat.Errors <= lastErrors * 2) return;

    // Remember the alarmed count; 5 * 60 is the expiry passed to the cache (presumably seconds)
    _cache.Set(cacheKey, stat.Errors, 5 * 60);

    var markdown = GetMarkdown(app, stat, true);
    RobotHelper.SendAlarm(app.Category, app.AlarmRobot, "系统告警", markdown);
}
/// <summary>Builds the markdown body of an application alarm: the total error count, one line per failing trace item and the first line of a sampled error text.</summary>
/// <param name="app">Application the alarm is about; used in the title.</param>
/// <param name="st">Application statistics row that triggered the alarm.</param>
/// <param name="includeTitle">Whether to start the text with a title line.</param>
/// <returns>Markdown text, cut to 1600 characters before the final "more information" link is appended.</returns>
private static String GetMarkdown(AppTracer app, AppMinuteStat st, Boolean includeTitle)
{
    var sb = new StringBuilder();
    if (includeTitle) sb.AppendLine($"### [{app}]系统告警");
    sb.AppendLine($">**总数:**<font color=\"red\">{st.Errors}</font>");

    // Links can only be built when the web site address is configured
    var url = Setting.Current.WebUrl;
    var appUrl = "";
    var traceUrl = "";
    if (!url.IsNullOrEmpty())
    {
        var baseUrl = url.EnsureEnd("/");
        appUrl = baseUrl + "Monitors/appMinuteStat?appId=" + st.AppId + "&minError=1";
        traceUrl = baseUrl + "Monitors/traceMinuteStat?appId=" + st.AppId + "&minError=1";
    }

    // Look for the trace items that actually failed, worst first
    var names = new HashSet<String>();
    var sts = TraceMinuteStat.FindAllByAppIdAndTime(st.AppId, st.StatTime)
        .Where(e => e.Errors > 0)
        .OrderByDescending(e => e.Errors)
        .ToList();
    foreach (var item in sts)
    {
        sb.Append($">**错误:**<font color=\"red\">{item.StatTime:HH:mm:ss} 埋点[{item.Name}]报错[{item.Errors:n0}]次</font>");
        // Without a configured address the link target would be the broken "&itemId=...", so leave the link out
        if (!traceUrl.IsNullOrEmpty()) sb.Append($"[更多]({traceUrl}&itemId={item.ItemId})");
        sb.AppendLine();

        // The error text of the same trace name is reported only once
        if (names.Contains(item.Name)) continue;

        var ds = TraceData.Search(st.AppId, item.ItemId, "minute", item.StatTime, 20);
        if (ds.Count <= 0) continue;

        var sample = SampleData.FindAllByDataIds(ds.Select(e => e.Id).ToArray(), item.StatTime)
            .FirstOrDefault(e => !e.Error.IsNullOrEmpty());
        if (sample == null) continue;

        var msg = sample.Error?.Trim();
        if (msg.IsNullOrEmpty()) continue;

        // Only the first line of the error text; the details are behind the link
        var p = msg.IndexOfAny(new[] { '\r', '\n' });
        if (p > 0) msg = msg[..p];

        sb.AppendLine($">**内容:**{msg}");
        names.Add(item.Name);
    }

    var str = sb.ToString();
    if (str.Length > 1600) str = str[..1600];

    // Append the link after cutting, so that it is never truncated
    if (!appUrl.IsNullOrEmpty())
    {
        str += Environment.NewLine + $"[更多信息]({appUrl})";
    }

    return str;
}
#endregion
#region 节点告警
/// <summary>Checks the latest data of one node against its CPU, memory, disk, TCP and process thresholds and sends an alarm for each threshold that is reached.</summary>
/// <param name="node">Node to check; a null or disabled node, or one that cannot alarm, is skipped.</param>
private void ProcessNode(Node node)
{
    if (node == null || !node.Enable || !RobotHelper.CanAlarm(node.Category, node.WebHook)) return;

    // Nothing to check when no threshold is configured. AlarmTcp has to be part of this test,
    // otherwise a node that only sets the TCP threshold would never reach the TCP check below.
    if (node.AlarmCpuRate <= 0 && node.AlarmMemoryRate <= 0 && node.AlarmDiskRate <= 0 &&
        node.AlarmTcp <= 0 && node.AlarmProcesses.IsNullOrEmpty())
    {
        return;
    }

    using var span = _tracer?.NewSpan($"Alarm:{nameof(Node)}");

    // Latest data
    var data = NodeData.FindLast(node.ID);
    if (data == null) return;

    // CPU alarm
    if (node.AlarmCpuRate > 0)
    {
        var rate = data.CpuRate * 100;
        if (rate >= node.AlarmCpuRate && ShouldSendNodeAlarm("alarm:CpuRate:" + node.ID, rate))
        {
            SendAlarm("cpu", node, data, $"[{node.Name}]CPU告警");
        }
    }

    // Memory alarm
    if (node.AlarmMemoryRate > 0 && node.Memory > 0)
    {
        var rate = (node.Memory - data.AvailableMemory) * 100d / node.Memory;
        if (rate >= node.AlarmMemoryRate && ShouldSendNodeAlarm("alarm:MemoryRate:" + node.ID, rate))
        {
            SendAlarm("memory", node, data, $"[{node.Name}]内存告警");
        }
    }

    // Disk alarm
    if (node.AlarmDiskRate > 0 && node.TotalSize > 0)
    {
        var rate = (node.TotalSize - data.AvailableFreeSpace) * 100d / node.TotalSize;
        if (rate >= node.AlarmDiskRate && ShouldSendNodeAlarm("alarm:DiskRate:" + node.ID, rate))
        {
            SendAlarm("disk", node, data, $"[{node.Name}]磁盘告警");
        }
    }

    // TCP alarm: the largest of the three counters is compared with the threshold
    if (node.AlarmTcp > 0)
    {
        var tcp = data.TcpConnections;
        if (tcp < data.TcpTimeWait) tcp = data.TcpTimeWait;
        if (tcp < data.TcpCloseWait) tcp = data.TcpCloseWait;

        if (tcp >= node.AlarmTcp && ShouldSendNodeAlarm("alarm:Tcp:" + node.ID, tcp))
        {
            SendAlarm("tcp", node, data, $"[{node.Name}]Tcp告警");
        }
    }

    // Process alarm
    if (!node.AlarmProcesses.IsNullOrEmpty())
    {
        var olt = NodeOnline.FindByNodeId(node.ID);
        if (olt != null && !olt.Processes.IsNullOrEmpty())
        {
            var alarms = node.AlarmProcesses.Split(",", StringSplitOptions.RemoveEmptyEntries);
            var ps = olt.Processes.Split(",", StringSplitOptions.RemoveEmptyEntries);
            if (alarms.Length > 0 && ps.Length > 0)
            {
                // Watched processes that are missing from the reported process list
                var ps2 = alarms.Where(e => !ps.Contains(e)).ToList();
                if (ps2.Count > 0)
                {
                    // While a previous alarm is remembered, alarm again only when more processes are missing
                    var error2 = _cache.Get<Int32>("alarm:Process:" + node.ID);
                    if (error2 == 0 || ps2.Count > error2)
                    {
                        _cache.Set("alarm:Process:" + node.ID, ps2.Count, 5 * 60);
                        SendAlarm("process", node, data, $"[{node.Name}]进程守护告警", ps2.Join());
                    }
                }
            }
        }
    }
}

/// <summary>Decides whether a node alarm may be sent now and, if so, remembers the value that triggered it.</summary>
/// <param name="key">Cache key of the alarm.</param>
/// <param name="value">Current value that reached its threshold.</param>
/// <returns>True when no value is remembered for the key, or when the value is more than twice the remembered one.</returns>
private Boolean ShouldSendNodeAlarm(String key, Double value)
{
    var last = _cache.Get<Int32>(key);
    if (last != 0 && value <= last * 2) return false;

    // Stored as Int32, the same type it is read back as, instead of relying on the cache to convert a Double.
    // 5 * 60 is the expiry passed to the cache (presumably seconds).
    _cache.Set(key, (Int32)value, 5 * 60);
    return true;
}
/// <summary>Builds the markdown text of a node alarm and sends it to the robot configured for the node.</summary>
/// <param name="kind">Alarm kind, passed on to the markdown builder.</param>
/// <param name="node">Node the alarm is about; its Category and WebHook select the robot.</param>
/// <param name="data">Node data the alarm is based on.</param>
/// <param name="title">Title of the alarm.</param>
/// <param name="info">Optional extra text, passed on to the markdown builder.</param>
private void SendAlarm(String kind, Node node, NodeData data, String title, String info = null)
{
    var markdown = GetMarkdown(kind, node, data, title, info);

    RobotHelper.SendAlarm(node.Category, node.WebHook, title, markdown);
}
/// <summary>Builds the markdown body of a node alarm: general node information followed by the lines specific to the alarm kind.</summary>
/// <param name="kind">Alarm kind: "cpu", "memory", "disk", "tcp" or "process"; any other value adds no specific line.</param>
/// <param name="node">Node the alarm is about.</param>
/// <param name="data">Node data the alarm is based on.</param>
/// <param name="title">Title line; left out when null or empty.</param>
/// <param name="msg">Text shown for the "process" kind.</param>
/// <returns>Markdown text, cut to 2000 characters before the "more information" link is appended.</returns>
private static String GetMarkdown(String kind, Node node, NodeData data, String title, String msg = null)
{
    var text = new StringBuilder();

    // One quoted markdown line: bold label followed by a colored value
    void Line(String label, String color, Object value) => text.AppendLine($">**{label}:**<font color=\"{color}\">{value}</font>");

    if (!title.IsNullOrEmpty()) text.AppendLine($"### {title}");

    // General node information
    Line("节点", "gray", $"{node} / {node.IP}");
    Line("分类", "gray", node.Category);
    Line("系统", "gray", node.OS);
    Line("CPU核心", "gray", node.Cpu);
    Line("内存容量", "gray", $"{node.Memory:n0}M,可用 {data.AvailableMemory:n0}M");
    Line("磁盘容量", "gray", $"{node.TotalSize:n0}M,可用 {data.AvailableFreeSpace:n0}M");

    // Lines specific to the alarm kind
    if (kind == "cpu")
    {
        Line("CPU使用率", "red", $"{data.CpuRate:p0} >= {node.AlarmCpuRate / 100d:p0}");
    }
    else if (kind == "memory")
    {
        // An unknown capacity (0) is shown as fully used
        var used = node.Memory == 0 ? 1d : 1 - (Double)data.AvailableMemory / node.Memory;
        Line("内存使用率", "red", $"{used:p0} >= {node.AlarmMemoryRate / 100d:p0}");
    }
    else if (kind == "disk")
    {
        // An unknown capacity (0) is shown as fully used
        var used = node.TotalSize == 0 ? 1d : 1 - (Double)data.AvailableFreeSpace / node.TotalSize;
        Line("磁盘使用率", "red", $" {used:p0} >= {node.AlarmDiskRate / 100d:p0}");
    }
    else if (kind == "tcp")
    {
        // Each counter that reached the threshold gets its own line
        var limit = $"{node.AlarmTcp:n0}";
        if (data.TcpConnections >= node.AlarmTcp) Line("TCP连接数", "red", $"{data.TcpConnections:n0} >= {limit}");
        if (data.TcpTimeWait >= node.AlarmTcp) Line("TCP主动关闭", "red", $"{data.TcpTimeWait:n0} >= {limit}");
        if (data.TcpCloseWait >= node.AlarmTcp) Line("TCP被动关闭", "red", $"{data.TcpCloseWait:n0} >= {limit}");
    }
    else if (kind == "process")
    {
        Line("进程已退出", "red", msg);
    }

    var result = text.ToString();
    if (result.Length > 2000) result = result.Substring(0, 2000);

    // Link to the node data page, only when the web site address is configured
    var site = Setting.Current.WebUrl;
    if (site.IsNullOrEmpty()) return result;

    var link = site.EnsureEnd("/") + "Nodes/NodeData?nodeId=" + node.ID;
    return result + Environment.NewLine + $"[更多信息]({link})";
}
#endregion
#region Redis告警
/// <summary>Runs the data alarms and the queue alarms of one Redis node.</summary>
/// <param name="node">Redis node to check; skipped when null, disabled or without a webhook.</param>
private void ProcessRedisNode(RedisNode node)
{
    if (node == null) return;
    if (!node.Enable) return;
    if (node.WebHook.IsNullOrEmpty()) return;

    ProcessRedisData(node);
    ProcessRedisQueue(node);
}
/// <summary>Checks the latest data of one Redis node against its memory, connection, speed and traffic thresholds and sends a single alarm that lists every threshold that was reached.</summary>
/// <param name="node">Redis node to check.</param>
private void ProcessRedisData(RedisNode node)
{
    if (!RobotHelper.CanAlarm(node.Category, node.WebHook)) return;

    // NOTE(review): this skips the node unless BOTH the memory rate and the connection threshold are set,
    // which also disables the speed and traffic checks below - confirm that this is intended.
    if (node.AlarmMemoryRate <= 0 || node.AlarmConnections == 0) return;

    // Latest data
    var data = RedisData.FindLast(node.Id);
    if (data == null) return;

    using var span = _tracer?.NewSpan($"Alarm:{nameof(RedisNode)}");

    // Each reached threshold contributes one line to the alarm text
    var actions = new List<Action<StringBuilder>>();

    // Memory alarm. Without a known capacity (MaxMemory of 0 or less) there is no usage rate,
    // and an integer division by zero would fail.
    var rate = node.MaxMemory > 0 ? data.UsedMemory * 100 / node.MaxMemory : 0;
    if (rate >= node.AlarmMemoryRate)
    {
        // While a previous alarm is remembered, alarm again only when the value has more than doubled
        var error2 = _cache.Get<Int32>("alarm:RedisMemory:" + node.Id);
        if (error2 == 0 || rate > error2 * 2)
        {
            _cache.Set("alarm:RedisMemory:" + node.Id, rate, 5 * 60);
            // Divide by 100d: an integer division would print both percentages as 0%
            actions.Add(sb => sb.AppendLine($">**内存告警:**<font color=\"red\">{rate / 100d:p0} >= {node.AlarmMemoryRate / 100d:p0}</font>"));
        }
    }

    // Connection count alarm
    var cs = data.ConnectedClients;
    if (node.AlarmConnections > 0 && cs >= node.AlarmConnections)
    {
        // While a previous alarm is remembered, alarm again only when the value has more than doubled
        var error2 = _cache.Get<Int32>("alarm:RedisConnections:" + node.Id);
        if (error2 == 0 || cs > error2 * 2)
        {
            _cache.Set("alarm:RedisConnections:" + node.Id, cs, 5 * 60);
            actions.Add(sb => sb.AppendLine($">**连接数告警:**<font color=\"red\">{cs:n0} >= {node.AlarmConnections:n0}</font>"));
        }
    }

    // Speed alarm
    var speed = data.Speed;
    if (node.AlarmSpeed > 0 && speed >= node.AlarmSpeed)
    {
        // While a previous alarm is remembered, alarm again only when the value has more than doubled
        var error2 = _cache.Get<Int32>("alarm:RedisSpeed:" + node.Id);
        if (error2 == 0 || speed > error2 * 2)
        {
            _cache.Set("alarm:RedisSpeed:" + node.Id, speed, 5 * 60);
            actions.Add(sb => sb.AppendLine($">**速度告警:**<font color=\"red\">{speed:n0} >= {node.AlarmSpeed:n0}</font>"));
        }
    }

    // Inbound traffic alarm
    var input = data.InputKbps;
    if (node.AlarmInputKbps > 0 && input >= node.AlarmInputKbps)
    {
        // While a previous alarm is remembered, alarm again only when the value has more than doubled
        var error2 = _cache.Get<Int32>("alarm:RedisInputKbps:" + node.Id);
        if (error2 == 0 || input > error2 * 2)
        {
            _cache.Set("alarm:RedisInputKbps:" + node.Id, input, 5 * 60);
            actions.Add(sb => sb.AppendLine($">**入流量告警:**<font color=\"red\">{input:n0} >= {node.AlarmInputKbps:n0}</font>"));
        }
    }

    // Outbound traffic alarm
    var output = data.OutputKbps;
    if (node.AlarmOutputKbps > 0 && output >= node.AlarmOutputKbps)
    {
        // While a previous alarm is remembered, alarm again only when the value has more than doubled
        var error2 = _cache.Get<Int32>("alarm:RedisOutputKbps:" + node.Id);
        if (error2 == 0 || output > error2 * 2)
        {
            _cache.Set("alarm:RedisOutputKbps:" + node.Id, output, 5 * 60);
            actions.Add(sb => sb.AppendLine($">**出流量告警:**<font color=\"red\">{output:n0} >= {node.AlarmOutputKbps:n0}</font>"));
        }
    }

    // One message for all reached thresholds
    if (actions.Count > 0)
    {
        var msg = GetMarkdown(node, data, "Redis告警", actions);
        RobotHelper.SendAlarm(node.Category, node.WebHook, "Redis告警", msg);
    }
}
/// <summary>Builds the markdown body of a Redis alarm: general node information followed by the lines written by the given actions.</summary>
/// <param name="node">Redis node the alarm is about.</param>
/// <param name="data">Redis data the alarm is based on.</param>
/// <param name="title">Title; the title line is left out when null or empty.</param>
/// <param name="actions">Callbacks that each append their alarm line to the text.</param>
/// <returns>Markdown text, cut to 2000 characters before the "more information" link is appended.</returns>
private static String GetMarkdown(RedisNode node, RedisData data, String title, IList<Action<StringBuilder>> actions)
{
    var text = new StringBuilder();

    // One quoted markdown line: bold label followed by a gray value
    void Gray(String label, Object value) => text.AppendLine($">**{label}:**<font color=\"gray\">{value}</font>");

    if (!title.IsNullOrEmpty()) text.AppendLine($"### [{node}]{title}");

    // General node information
    Gray("分类", node.Category);
    Gray("版本", node.Version);
    Gray("已用内存", $"{data.UsedMemory:n0}");
    Gray("内存容量", $"{node.MaxMemory:n0}");
    Gray("连接数", $"{data.ConnectedClients:n0}");
    Gray("服务器", node.Server);

    // Alarm lines prepared by the caller
    for (var i = 0; i < actions.Count; i++)
    {
        actions[i](text);
    }

    var result = text.ToString();
    if (result.Length > 2000) result = result.Substring(0, 2000);

    // Link to the Redis node page, only when the web site address is configured
    var site = Setting.Current.WebUrl;
    if (site.IsNullOrEmpty()) return result;

    var link = site.EnsureEnd("/") + "Nodes/RedisNode?id=" + node.Id;
    return result + Environment.NewLine + $"[更多信息]({link})";
}
#endregion
#region Redis队列告警
/// <summary>Checks every message queue of one Redis node and sends an alarm for each queue whose message count reached its limit.</summary>
/// <param name="node">Redis node whose queues are checked.</param>
private void ProcessRedisQueue(RedisNode node)
{
    using var span = _tracer?.NewSpan($"Alarm:{nameof(RedisMessageQueue)}");

    // All queues of this Redis node
    foreach (var queue in RedisMessageQueue.FindAllByRedisId(node.Id))
    {
        // The queue's own category and webhook are used when set, otherwise the node's
        var groupName = queue.Category.IsNullOrEmpty() ? node.Category : queue.Category;
        var webhook = queue.WebHook.IsNullOrEmpty() ? node.WebHook : queue.WebHook;

        // Does this queue need an alarm
        if (!queue.Enable) continue;
        if (queue.MaxMessages <= 0) continue;
        if (queue.Messages < queue.MaxMessages) continue;
        if (!RobotHelper.CanAlarm(groupName, webhook)) continue;

        // While a previous alarm is remembered, alarm again only when the backlog has more than doubled
        var cacheKey = "alarm:RedisMessageQueue:" + queue.Id;
        var lastMessages = _cache.Get<Int32>(cacheKey);
        if (lastMessages != 0 && queue.Messages <= lastMessages * 2) continue;

        _cache.Set(cacheKey, queue.Messages, 5 * 60);

        var markdown = GetMarkdown(node, queue, true);
        RobotHelper.SendAlarm(groupName, webhook, "消息队列告警", markdown);
    }
}
/// <summary>Builds the markdown body of a message queue alarm.</summary>
/// <param name="node">Redis node that hosts the queue.</param>
/// <param name="queue">Queue the alarm is about.</param>
/// <param name="includeTitle">Whether to start the text with a title line.</param>
/// <returns>Markdown text, cut to 2000 characters before the "more information" link is appended.</returns>
private static String GetMarkdown(RedisNode node, RedisMessageQueue queue, Boolean includeTitle)
{
    var sb = new StringBuilder();
    if (includeTitle) sb.AppendLine($"### [{queue.Name}/{node}]消息队列告警");
    sb.AppendLine($">**主题:**<font color=\"gray\">{queue.Topic}</font>");
    // ">=" matches the condition under which the alarm is raised (Messages >= MaxMessages)
    sb.AppendLine($">**积压:**<font color=\"red\">{queue.Messages:n0} >= {queue.MaxMessages:n0}</font>");
    sb.AppendLine($">**消费者:**<font color=\"green\">{queue.Consumers}</font>");
    sb.AppendLine($">**总消费:**<font color=\"green\">{queue.Total:n0}</font>");
    sb.AppendLine($">**服务器:**<font color=\"gray\">{node.Server}</font>");

    var str = sb.ToString();
    if (str.Length > 2000) str = str[..2000];

    // Link to the queue page, only when the web site address is configured
    var url = Setting.Current.WebUrl;
    if (!url.IsNullOrEmpty())
    {
        // The queue name is free text, so it is encoded before it goes into the query string
        url = url.EnsureEnd("/") + "Nodes/RedisMessageQueue?redisId=" + queue.RedisId + "&q=" + HttpUtility.UrlEncode(queue.Name);
        str += Environment.NewLine + $"[更多信息]({url})";
    }

    return str;
}
#endregion
}
}
|