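-- Active health checker for nginx upstream peers, with a plain-text status
-- page and Prometheus-format metrics output. Peer state is shared across
-- worker processes through a lua_shared_dict zone (the "shm" option below).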
local stream_sock = ngx.socket.tcp
local log = ngx.log
local ERR = ngx.ERR
local WARN = ngx.WARN
local DEBUG = ngx.DEBUG
local ngx = ngx
local error = error
local string = string
local sub = string.sub
local re_find = ngx.re.find
local new_timer = ngx.timer.at
local shared = ngx.shared
local debug_mode = ngx.config.debug
local concat = table.concat
local tonumber = tonumber
local tostring = tostring
local ipairs = ipairs
local ceil = math.ceil
local spawn = ngx.thread.spawn
local wait = ngx.thread.wait
local pcall = pcall
local setmetatable = setmetatable
-- LuaFormatter off
local _M = {
_VERSION = '0.08'
}
if not ngx.config
or not ngx.config.ngx_lua_version
or ngx.config.ngx_lua_version < 9005
then
error("ngx_lua 0.9.5+ required")
end
-- LuaFormatter on
local ok, upstream = pcall(require, "ngx.upstream")
if not ok then
error("ngx_upstream_lua module required")
end
local ok, new_tab = pcall(require, "table.new")
if not ok or type(new_tab) ~= "function" then
new_tab = function(narr, nrec)
return {}
end
end
local set_peer_down = upstream.set_peer_down
local get_primary_peers = upstream.get_primary_peers
local get_backup_peers = upstream.get_backup_peers
local get_upstreams = upstream.get_upstreams
local upstream_checker_statuses = {}
local function warn(...)
log(WARN, "healthcheck: ", ...)
end
local function errlog(...)
log(ERR, "healthcheck: ", ...)
end
local function debug(...)
-- print("debug mode: ", debug_mode)
if debug_mode then
log(DEBUG, "healthcheck: ", ...)
end
end
local function gen_peer_key(prefix, u, is_backup, id)
if is_backup then
return prefix .. u .. ":b" .. id
end
return prefix .. u .. ":p" .. id
end
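-- For illustration: with a hypothetical upstream named "foo.com", the keys
-- generated above look like "d:foo.com:p0" (down state of primary peer 0)
-- or "ok:foo.com:b1" (success count of backup peer 1); the "d:", "ok:" and
-- "nok:" prefixes are supplied by the callers below.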
local function set_peer_down_globally(ctx, is_backup, id, value)
local u = ctx.upstream
local dict = ctx.dict
local ok, err = set_peer_down(u, is_backup, id, value)
if not ok then
errlog("failed to set peer down: ", err)
end
if not ctx.new_version then
ctx.new_version = true
end
local key = gen_peer_key("d:", u, is_backup, id)
local ok, err = dict:set(key, value)
if not ok then
errlog("failed to set peer down state: ", err)
end
end
local function peer_fail(ctx, is_backup, id, peer)
debug("peer ", peer.name, " was checked to be not ok")
local u = ctx.upstream
local dict = ctx.dict
local key = gen_peer_key("nok:", u, is_backup, id)
local fails, err = dict:get(key)
if not fails then
if err then
errlog("failed to get peer nok key: ", err)
return
end
fails = 1
-- below may have a race condition, but it is fine for our
-- purpose here.
local ok, err = dict:set(key, 1)
if not ok then
errlog("failed to set peer nok key: ", err)
end
else
fails = fails + 1
local ok, err = dict:incr(key, 1)
if not ok then
errlog("failed to incr peer nok key: ", err)
end
end
if fails == 1 then
key = gen_peer_key("ok:", u, is_backup, id)
local succ, err = dict:get(key)
if not succ or succ == 0 then
if err then
errlog("failed to get peer ok key: ", err)
return
end
else
local ok, err = dict:set(key, 0)
if not ok then
errlog("failed to set peer ok key: ", err)
end
end
end
-- print("ctx fall: ", ctx.fall, ", peer down: ", peer.down,
-- ", fails: ", fails)
if not peer.down and fails >= ctx.fall then
warn("peer ", peer.name, " is turned down after ", fails, " failure(s)")
peer.down = true
set_peer_down_globally(ctx, is_backup, id, true)
end
end
local function peer_ok(ctx, is_backup, id, peer)
debug("peer ", peer.name, " was checked to be ok")
local u = ctx.upstream
local dict = ctx.dict
local key = gen_peer_key("ok:", u, is_backup, id)
local succ, err = dict:get(key)
if not succ then
if err then
errlog("failed to get peer ok key: ", err)
return
end
succ = 1
-- below may have a race condition, but it is fine for our
-- purpose here.
local ok, err = dict:set(key, 1)
if not ok then
errlog("failed to set peer ok key: ", err)
end
else
succ = succ + 1
local ok, err = dict:incr(key, 1)
if not ok then
errlog("failed to incr peer ok key: ", err)
end
end
if succ == 1 then
key = gen_peer_key("nok:", u, is_backup, id)
local fails, err = dict:get(key)
if not fails or fails == 0 then
if err then
errlog("failed to get peer nok key: ", err)
return
end
else
local ok, err = dict:set(key, 0)
if not ok then
errlog("failed to set peer nok key: ", err)
end
end
end
if peer.down and succ >= ctx.rise then
warn("peer ", peer.name, " is turned up after ", succ, " success(es)")
peer.down = nil
set_peer_down_globally(ctx, is_backup, id, nil)
end
end
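-- A worked example of the fall/rise accounting above, assuming fall = 3 and
-- rise = 2: three consecutive failed checks push the "nok:" counter to 3,
-- so peer_fail() turns the peer down; the first subsequent successful check
-- resets "nok:" to 0 and starts the "ok:" counter, and once it reaches 2,
-- peer_ok() turns the peer up again. The first event of a new streak always
-- zeroes the opposing counter, so only consecutive results count toward
-- fall and rise.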
-- shortcut error function for check_peer()
local function peer_error(ctx, is_backup, id, peer, ...)
if not peer.down then
errlog(...)
end
peer_fail(ctx, is_backup, id, peer)
end
local function check_peer(ctx, id, peer, is_backup)
local ok
local name = peer.name
local statuses = ctx.statuses
local req = ctx.http_req
local sock, err = stream_sock()
if not sock then
errlog("failed to create stream socket: ", err)
return
end
sock:settimeout(ctx.timeout)
if peer.host then
-- print("peer port: ", peer.port)
ok, err = sock:connect(peer.host, peer.port)
else
ok, err = sock:connect(name)
end
if not ok then
if not peer.down then
errlog("failed to connect to ", name, ": ", err)
end
return peer_fail(ctx, is_backup, id, peer)
end
if ctx.type == "https" then
ok, err = sock:sslhandshake(nil, ctx.host, ctx.ssl_verify)
if not ok then
sock:close()
return peer_error(ctx, is_backup, id, peer,
"failed to ssl handshake to ", name, ": ", err)
end
end
local bytes, err = sock:send(req)
if not bytes then
return peer_error(ctx, is_backup, id, peer,
"failed to send request to ", name, ": ", err)
end
local status_line, err = sock:receive()
if not status_line then
peer_error(ctx, is_backup, id, peer,
"failed to receive status line from ", name, ": ", err)
if err == "timeout" then
sock:close() -- timeout errors do not close the socket.
end
return
end
if statuses then
local from, to, err = re_find(status_line, [[^HTTP/\d+\.\d+\s+(\d+)]],
"joi", nil, 1)
if err then
errlog("failed to parse status line: ", err)
end
if not from then
peer_error(ctx, is_backup, id, peer, "bad status line from ", name,
": ", status_line)
sock:close()
return
end
local status = tonumber(sub(status_line, from, to))
if not statuses[status] then
peer_error(ctx, is_backup, id, peer, "bad status code from ", name,
": ", status)
sock:close()
return
end
end
peer_ok(ctx, is_backup, id, peer)
sock:close()
end
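-- For example, a status line of "HTTP/1.1 200 OK" matches the regex above
-- with the first capture spanning "200", so tonumber() yields 200, which is
-- then looked up in the ctx.statuses set built from the valid_statuses
-- option; when no valid_statuses list was given, ctx.statuses is nil and
-- any complete status line counts as a success.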
local function check_peer_range(ctx, from, to, peers, is_backup)
for i = from, to do
check_peer(ctx, i - 1, peers[i], is_backup)
end
end
local function check_peers(ctx, peers, is_backup)
local n = #peers
if n == 0 then
return
end
local concur = ctx.concurrency
if concur <= 1 then
for i = 1, n do
check_peer(ctx, i - 1, peers[i], is_backup)
end
else
local threads
local nthr
if n <= concur then
nthr = n - 1
threads = new_tab(nthr, 0)
for i = 1, nthr do
if debug_mode then
debug("spawn a thread checking ",
is_backup and "backup" or "primary", " peer ", i - 1)
end
threads[i] = spawn(check_peer, ctx, i - 1, peers[i], is_backup)
end
-- use the current "light thread" to run the last task
if debug_mode then
debug("check ", is_backup and "backup" or "primary", " peer ",
n - 1)
end
check_peer(ctx, n - 1, peers[n], is_backup)
else
local group_size = ceil(n / concur)
nthr = ceil(n / group_size) - 1
threads = new_tab(nthr, 0)
local from = 1
local rest = n
for i = 1, nthr do
local to
if rest >= group_size then
rest = rest - group_size
to = from + group_size - 1
else
rest = 0
to = from + rest - 1
end
if debug_mode then
debug("spawn a thread checking ",
is_backup and "backup" or "primary", " peers ",
from - 1, " to ", to - 1)
end
threads[i] = spawn(check_peer_range, ctx, from, to, peers,
is_backup)
from = from + group_size
if rest == 0 then
break
end
end
if rest > 0 then
local to = from + rest - 1
if debug_mode then
debug("check ", is_backup and "backup" or "primary",
" peers ", from - 1, " to ", to - 1)
end
check_peer_range(ctx, from, to, peers, is_backup)
end
end
if nthr and nthr > 0 then
for i = 1, nthr do
local t = threads[i]
if t then
wait(t)
end
end
end
end
end
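-- A worked example of the grouping above: with n = 5 peers and
-- concurrency = 2, group_size = ceil(5/2) = 3 and nthr = ceil(5/3) - 1 = 1,
-- so one spawned light thread checks peers[1..3] (peer ids 0..2) while the
-- current thread handles the remaining peers[4..5] (ids 3..4) through the
-- trailing check_peer_range() call.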
local function upgrade_peers_version(ctx, peers, is_backup)
local dict = ctx.dict
local u = ctx.upstream
local n = #peers
for i = 1, n do
local peer = peers[i]
local id = i - 1
local key = gen_peer_key("d:", u, is_backup, id)
local down = false
local res, err = dict:get(key)
if not res then
if err then
errlog("failed to get peer down state: ", err)
end
else
down = true
end
if (peer.down and not down) or (not peer.down and down) then
local ok, err = set_peer_down(u, is_backup, id, down)
if not ok then
errlog("failed to set peer down: ", err)
else
-- update our cache too
peer.down = down
end
end
end
end
local function check_peers_updates(ctx)
local dict = ctx.dict
local u = ctx.upstream
local key = "v:" .. u
local ver, err = dict:get(key)
if not ver then
if err then
errlog("failed to get peers version: ", err)
return
end
if ctx.version > 0 then
ctx.new_version = true
end
elseif ctx.version < ver then
debug("upgrading peers version to ", ver)
upgrade_peers_version(ctx, ctx.primary_peers, false);
upgrade_peers_version(ctx, ctx.backup_peers, true);
ctx.version = ver
end
end
local function get_lock(ctx)
local dict = ctx.dict
local key = "l:" .. ctx.upstream
-- the lock is held for the whole interval to prevent multiple
-- worker processes from sending the test request simultaneously.
-- here we shorten the lock expiration time by 1ms to prevent
-- a race condition with the next timer event.
local ok, err = dict:add(key, true, ctx.interval - 0.001)
if not ok then
if err == "exists" then
return nil
end
errlog("failed to add key \"", key, "\": ", err)
return nil
end
return true
end
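-- Note that dict:add() is atomic across workers, so for a given upstream
-- only the first worker whose timer fires wins the "l:" key each interval;
-- the others see err == "exists" and skip the active checks for that cycle
-- (they still pick up shared state changes via check_peers_updates).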
local function do_check(ctx)
debug("healthcheck: run a check cycle")
check_peers_updates(ctx)
if get_lock(ctx) then
check_peers(ctx, ctx.primary_peers, false)
check_peers(ctx, ctx.backup_peers, true)
end
if ctx.new_version then
local key = "v:" .. ctx.upstream
local dict = ctx.dict
if debug_mode then
debug("publishing peers version ", ctx.version + 1)
end
dict:add(key, 0)
local new_ver, err = dict:incr(key, 1)
if not new_ver then
errlog("failed to publish new peers version: ", err)
end
ctx.version = new_ver
ctx.new_version = nil
end
end
local function update_upstream_checker_status(upstream, success)
local cnt = upstream_checker_statuses[upstream]
if not cnt then
cnt = 0
end
if success then
cnt = cnt + 1
else
cnt = cnt - 1
end
upstream_checker_statuses[upstream] = cnt
end
local check
check = function(premature, ctx)
if premature then
return
end
local ok, err = pcall(do_check, ctx)
if not ok then
errlog("failed to run healthcheck cycle: ", err)
end
local ok, err = new_timer(ctx.interval, check, ctx)
if not ok then
if err ~= "process exiting" then
errlog("failed to create timer: ", err)
end
update_upstream_checker_status(ctx.upstream, false)
return
end
end
local function preprocess_peers(peers, port)
local n = #peers
for i = 1, n do
local p = peers[i]
local name = p.name
if name then
local from, to, err = re_find(name, [[^(.*):\d+$]], "jo", nil, 1)
if from then
p.host = sub(name, 1, to)
p.port = port or tonumber(sub(name, to + 2))
end
end
end
return peers
end
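-- For example, a peer named "127.0.0.1:8080" is split by the regex above
-- into p.host = "127.0.0.1" and p.port = 8080 (or opts.port when given),
-- while a name like "unix:/tmp/nginx.sock" does not match, so p.host stays
-- nil and check_peer() falls back to sock:connect(name).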
function _M.spawn_checker(opts)
local typ = opts.type
if not typ then
return nil, "\"type\" option required"
end
if typ ~= "http" and typ ~= "https" then
return nil, "only \"http\" and \"https\" type are supported right now"
end
local ssl_verify = opts.ssl_verify
if ssl_verify == nil then
ssl_verify = true
end
local http_req = opts.http_req
if not http_req then
return nil, "\"http_req\" option required"
end
local timeout = opts.timeout
if not timeout then
timeout = 1000
end
local interval = opts.interval
if not interval then
interval = 1
else
interval = interval / 1000
if interval < 0.002 then -- minimum 2ms
interval = 0.002
end
end
local valid_statuses = opts.valid_statuses
local statuses
if valid_statuses then
statuses = new_tab(0, #valid_statuses)
for _, status in ipairs(valid_statuses) do
-- print("found good status ", status)
statuses[status] = true
end
end
-- debug("interval: ", interval)
local concur = opts.concurrency
if not concur then
concur = 1
end
local fall = opts.fall
if not fall then
fall = 5
end
local rise = opts.rise
if not rise then
rise = 2
end
local shm = opts.shm
if not shm then
return nil, "\"shm\" option required"
end
local dict = shared[shm]
if not dict then
return nil, "shm \"" .. tostring(shm) .. "\" not found"
end
local u = opts.upstream
if not u then
return nil, "no upstream specified"
end
local ppeers, err = get_primary_peers(u)
if not ppeers then
return nil, "failed to get primary peers: " .. err
end
local bpeers, err = get_backup_peers(u)
if not bpeers then
return nil, "failed to get backup peers: " .. err
end
local ctx = {
upstream = u,
primary_peers = preprocess_peers(ppeers, opts.port),
backup_peers = preprocess_peers(bpeers, opts.port),
http_req = http_req,
timeout = timeout,
interval = interval,
dict = dict,
fall = fall,
rise = rise,
statuses = statuses,
version = 0,
concurrency = concur,
type = typ,
host = opts.host,
ssl_verify = ssl_verify
}
if debug_mode and opts.no_timer then
check(nil, ctx)
else
local ok, err = new_timer(0, check, ctx)
if not ok then
return nil, "failed to create timer: " .. err
end
end
update_upstream_checker_status(u, true)
return true
end
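-- A minimal usage sketch, typically run from an init_worker_by_lua_block.
-- The module path, shm zone "healthcheck", and upstream name "foo.com" are
-- illustrative assumptions:
--
--     local hc = require "resty.upstream.healthcheck"
--     local ok, err = hc.spawn_checker{
--         shm = "healthcheck",      -- a lua_shared_dict zone
--         upstream = "foo.com",     -- an upstream {} block name
--         type = "http",
--         http_req = "GET /status HTTP/1.0\r\nHost: foo.com\r\n\r\n",
--         interval = 2000,          -- run the check cycle every 2 sec
--         timeout = 1000,           -- 1 sec timeout for network operations
--         fall = 3,                 -- failures before turning a peer down
--         rise = 2,                 -- successes before turning a peer up
--         valid_statuses = {200, 302},
--         concurrency = 10,
--     }
--     if not ok then
--         ngx.log(ngx.ERR, "failed to spawn health checker: ", err)
--     end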
local new_status_meta = {
__add = function(self, rhs)
-- debug("new_status_meta:__add: rhs: ", rhs)
self.statuses[self.idx] = rhs
self.idx = self.idx + 1
end
}
new_status_meta.__index = new_status_meta
function new_status_meta:add(rhs)
self:__add(rhs)
end
local function new_status_table(n)
local tab = {statuses = new_tab(n * 90, 0), idx = 1}
return setmetatable(tab, new_status_meta)
end
-- combined upstream status adding functions
local function add_upstream_prometheus_status_line(tab, u, st)
tab:add('nginx_upstream_status_info{name="')
tab:add(u)
tab:add('",status="')
tab:add(st)
tab:add('\n')
end
local function add_upstream_up_prometheus_status(tab, u)
add_upstream_prometheus_status_line(tab, u, 'UP"} 1');
add_upstream_prometheus_status_line(tab, u, 'DOWN"} 0');
add_upstream_prometheus_status_line(tab, u, 'UNKNOWN"} 0');
end
local function add_upstream_down_prometheus_status(tab, u)
add_upstream_prometheus_status_line(tab, u, 'UP"} 0');
add_upstream_prometheus_status_line(tab, u, 'DOWN"} 1');
add_upstream_prometheus_status_line(tab, u, 'UNKNOWN"} 0');
end
local function add_upstream_unknown_prometheus_status(tab, u)
add_upstream_prometheus_status_line(tab, u, 'UP"} 0');
add_upstream_prometheus_status_line(tab, u, 'DOWN"} 0');
add_upstream_prometheus_status_line(tab, u, 'UNKNOWN"} 1');
end
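-- Sample upstream-level output from the helpers above, for a hypothetical
-- upstream "foo.com" with at least one peer up:
--
--     nginx_upstream_status_info{name="foo.com",status="UP"} 1
--     nginx_upstream_status_info{name="foo.com",status="DOWN"} 0
--     nginx_upstream_status_info{name="foo.com",status="UNKNOWN"} 0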
-- peer status generator functions
local function gen_peer_prometheus_status(tab, u, p, r, s, n)
tab:add("nginx_upstream_status_info{name=\"")
tab:add(u)
tab:add("\",endpoint=\"")
tab:add(p)
tab:add("\",status=\"")
tab:add(s)
tab:add("\",role=\"")
tab:add(r)
tab:add("\"} ")
tab:add(n)
tab:add("\n")
end
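-- A single peer-level line produced by the generator above looks like this
-- (the peer address is hypothetical):
--
--     nginx_upstream_status_info{name="foo.com",endpoint="127.0.0.1:8080",status="UP",role="PRIMARY"} 1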
-- combined peer status adding function
local function add_peer_prometheus_status(tab, u, p, r)
gen_peer_prometheus_status(tab, u, p.name, r, "UP", not p.down and 1 or 0)
gen_peer_prometheus_status(tab, u, p.name, r, "DOWN", p.down and 1 or 0)
end
local function add_peers_info(tab, u, peers, role)
local npeers = #peers
for i = 1, npeers do
local peer = peers[i]
tab:add(" ")
tab:add(peer.name)
if peer.down then
tab:add(" DOWN\n")
else
tab:add(" UP\n")
end
end
end
local function add_peers_prometheus_info(tab, u, peers, role)
local npeers = #peers
local found_up_peer = false
for i = 1, npeers do
add_peer_prometheus_status(tab, u, peers[i], role)
if not peers[i].down then
found_up_peer = true
end
end
return found_up_peer
end
function _M.prometheus_status_page()
-- generate Prometheus metrics in the following format:
-- # HELP nginx_upstream_status_info The running status of nginx upstream
-- # TYPE nginx_upstream_status_info gauge
-- nginx_upstream_status_info{name="",endpoint="",status="",role=""} num
local us, err = get_upstreams()
if not us then
return nil, "failed to get upstream names: " .. err
end
local n = #us
local stats_tab = new_status_table(n)
stats_tab:add(
"# HELP nginx_upstream_status_info The running status of nginx upstream\n")
stats_tab:add("# TYPE nginx_upstream_status_info gauge\n")
for i = 1, n do
local u = us[i]
local ncheckers = upstream_checker_statuses[u]
if not ncheckers or ncheckers == 0 then
add_upstream_unknown_prometheus_status(stats_tab, u)
goto continue
end
local peers, err = get_primary_peers(u)
if not peers then
add_upstream_down_prometheus_status(stats_tab, u)
else
local found_up_peer = false
if add_peers_prometheus_info(stats_tab, u, peers, "PRIMARY") then
found_up_peer = true
end
peers, err = get_backup_peers(u)
if peers then
if add_peers_prometheus_info(stats_tab, u, peers, "BACKUP") then
found_up_peer = true
end
end
if found_up_peer then
add_upstream_up_prometheus_status(stats_tab, u)
else
add_upstream_down_prometheus_status(stats_tab, u)
end
end
::continue::
end
return concat(stats_tab.statuses)
end
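-- A sketch of exposing the metrics over HTTP; the module path, port, and
-- location are assumptions:
--
--     server {
--         listen 9145;
--         location = /metrics {
--             default_type text/plain;
--             content_by_lua_block {
--                 local hc = require "resty.upstream.healthcheck"
--                 local metrics, err = hc.prometheus_status_page()
--                 if not metrics then
--                     ngx.log(ngx.ERR, "failed to render metrics: ", err)
--                     return ngx.exit(500)
--                 end
--                 ngx.print(metrics)
--             }
--         }
--     }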
function _M.status_page()
-- generate a plain-text status page
local us, err = get_upstreams()
if not us then
return "failed to get upstream names: " .. err
end
local n = #us
local stats_tab = new_status_table(n)
for i = 1, n do
if i > 1 then
stats_tab:add("\n")
end
local u = us[i]
stats_tab:add("Upstream ")
stats_tab:add(u)
local ncheckers = upstream_checker_statuses[u]
if not ncheckers or ncheckers == 0 then
stats_tab:add(" (NO checkers)")
end
stats_tab:add("\n Primary Peers\n")
local peers, err = get_primary_peers(u)
if not peers then
return "failed to get primary peers in upstream " .. u .. ": " .. err
end
add_peers_info(stats_tab, u, peers, "PRIMARY")
stats_tab:add(" Backup Peers\n")
peers, err = get_backup_peers(u)
if not peers then
return "failed to get backup peers in upstream " .. u .. ": " .. err
end
add_peers_info(stats_tab, u, peers, "BACKUP")
end
return concat(stats_tab.statuses)
end
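-- Sample status_page() output, with the single-space indentation emitted by
-- the string literals above (upstream name and peer addresses hypothetical):
--
--     Upstream foo.com
--      Primary Peers
--      127.0.0.1:12354 UP
--      127.0.0.1:12355 DOWN
--      Backup Peers
--      127.0.0.1:12356 UP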
return _M