File: //proc/1/task/1/root/usr/share/rspamd/plugins/url_redirector.lua
--[[
Copyright (c) 2022, Vsevolod Stakhov <vsevolod@rspamd.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]--
-- Stop loading the plugin when the global `confighelp` is set: nothing
-- below this guard (settings, callbacks, symbol registration) is executed.
-- NOTE(review): presumably rspamd sets this flag while it only collects
-- configuration help -- confirm against the host application.
if confighelp then
return
end
local rspamd_logger = require "rspamd_logger"
local rspamd_http = require "rspamd_http"
local hash = require "rspamd_cryptobox_hash"
local rspamd_url = require "rspamd_url"
local lua_util = require "lua_util"
local lua_redis = require "lua_redis"
local N = "url_redirector"
-- Coherent browser fingerprint profiles.
--
-- The url_redirector resolves shortened/redirector URLs by issuing HTTP
-- requests. Sites that cloak (serve different content to bots) commonly
-- key on a missing or inconsistent header set, so a lone User-Agent
-- string is the weakest possible disguise. Each profile instead bundles
-- a User-Agent with the exact header set, values and order that the
-- matching real browser sends, keeping the request internally consistent
-- (e.g. Chrome carries `sec-ch-ua` client hints; Firefox and Safari do
-- not).
--
-- `headers` is an ordered list of {name, value} pairs. rspamd_http keeps
-- this order on the wire (RSPAMD_HTTP_FLAG_ORDERED_HEADERS); the Host
-- header and request line are emitted by the HTTP client itself. One
-- profile is picked per task so every hop of every chain shares a single
-- identity, the way a real browser would.
-- Accept header value shared by every Chromium-family profile below.
local chromium_accept = table.concat({
  'text/html,application/xhtml+xml,',
  'application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,',
  'application/signed-exchange;v=b3;q=0.7',
})

-- Builds one Chromium-family profile. The Chromium variants use the same
-- header names, order and most values; only the brand list (sec-ch-ua),
-- the platform hint and the User-Agent string differ between them.
-- Every call returns fresh tables, so profiles never share header lists.
local function chromium_profile(name, brands, platform, user_agent)
  return {
    name = name,
    headers = {
      { 'Connection', 'keep-alive' },
      { 'sec-ch-ua', brands },
      { 'sec-ch-ua-mobile', '?0' },
      { 'sec-ch-ua-platform', platform },
      { 'Upgrade-Insecure-Requests', '1' },
      { 'User-Agent', user_agent },
      { 'Accept', chromium_accept },
      { 'Sec-Fetch-Site', 'none' },
      { 'Sec-Fetch-Mode', 'navigate' },
      { 'Sec-Fetch-User', '?1' },
      { 'Sec-Fetch-Dest', 'document' },
      { 'Accept-Encoding', 'gzip, deflate, br, zstd' },
      { 'Accept-Language', 'en-US,en;q=0.9' },
    },
  }
end

-- Built-in fingerprint profiles. Each entry carries a `name` and an
-- ordered `headers` list of { header_name, value } pairs.
local default_profiles = {
  chromium_profile('chrome_win',
    '"Not)A;Brand";v="8", "Chromium";v="148", "Google Chrome";v="148"',
    '"Windows"',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36'),
  chromium_profile('chrome_mac',
    '"Not)A;Brand";v="8", "Chromium";v="148", "Google Chrome";v="148"',
    '"macOS"',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36'),
  chromium_profile('edge_win',
    '"Not)A;Brand";v="8", "Chromium";v="148", "Microsoft Edge";v="148"',
    '"Windows"',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/148.0.0.0 Safari/537.36 Edg/148.0.0.0'),
  {
    -- Firefox: no sec-ch-ua client hints, and a header order and Accept
    -- value that differ from the Chromium profiles.
    name = 'firefox_win',
    headers = {
      { 'User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:150.0) Gecko/20100101 Firefox/150.0' },
      { 'Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' },
      { 'Accept-Language', 'en-US,en;q=0.5' },
      { 'Accept-Encoding', 'gzip, deflate, br, zstd' },
      { 'Connection', 'keep-alive' },
      { 'Upgrade-Insecure-Requests', '1' },
      { 'Sec-Fetch-Dest', 'document' },
      { 'Sec-Fetch-Mode', 'navigate' },
      { 'Sec-Fetch-Site', 'none' },
      { 'Sec-Fetch-User', '?1' },
      { 'Priority', 'u=0, i' },
    },
  },
  {
    -- Safari: no sec-ch-ua either, and a shorter header list.
    name = 'safari_mac',
    headers = {
      { 'Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' },
      { 'Accept-Encoding', 'gzip, deflate, br' },
      { 'Connection', 'keep-alive' },
      { 'User-Agent',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/26.5 Safari/605.1.15' },
      { 'Accept-Language', 'en-US,en;q=0.9' },
      { 'Sec-Fetch-Site', 'none' },
      { 'Sec-Fetch-Mode', 'navigate' },
      { 'Sec-Fetch-Dest', 'document' },
    },
  },
}
-- Redis connection parameters; assigned during module configuration.
local redis_params

-- Module defaults; user-supplied options are merged over these during
-- module configuration.
local settings = {
  -- Time budgets -----------------------------------------------------
  timeout = 8, -- total timeout of module
  -- Per-hop HTTP HEAD timeout. A number is the whole-request duration;
  -- a table may carry .connect_timeout, .ssl_timeout, .write_timeout and
  -- .read_timeout for granular control.
  http_timeout = 4,
  redis_timeout = 2, -- redis timeout for cache operations (redis.conf module has higher priority)

  -- Resolution limits --------------------------------------------------
  nested_limit = 2, -- how many redirects to follow
  max_urls = 5, -- how many urls to check (CTA checked in first place)
  max_size = 10 * 1024, -- maximum body to process
  check_ssl = false, -- check ssl certificates
  --proxy = "http://example.com:3128", -- send request through proxy, not yet implemented

  -- Request identity -----------------------------------------------------
  -- Optional operator override: a string, or a list of strings picked at
  -- random. When set, a single User-Agent header is sent and fingerprint
  -- profiles are skipped entirely. Leave unset to use the profiles.
  user_agent = nil,
  -- Browser fingerprint profiles used when user_agent is not set.
  fingerprint_profiles = default_profiles,

  -- Redis cache ------------------------------------------------------------
  expire = 86400, -- 1 day by default
  key_prefix = 'rdr:', -- default hash name
  top_urls_key = 'rdr:top_urls', -- key for top urls
  top_urls_count = 200, -- how many top urls to save

  -- Redirector selection ------------------------------------------------------
  redirectors_only = true, -- follow merely redirectors
  redirector_hosts_map = nil, -- check only those redirectors
  redirector_get_urls_map = nil, -- list of regex patterns for which GET should be used instead of HEAD

  -- Symbols -------------------------------------------------------------------
  redirector_symbol = nil, -- insert symbol if redirected url has been found
  redirector_symbol_nested = "URL_REDIRECTOR_NESTED", -- insert symbol if nested limit has been reached
  redirector_symbol_non_http = "URL_REDIRECTOR_NON_HTTP", -- HTTP -> non-HTTP(S) redirect detected

  -- Which intermediate redirect hops get injected into the task.
  save_intermediate_redirs = {
    redirectors = false,
    non_redirectors = true, -- inject non-redirector hops by default since they can hide cloaker phishing urls
  },
}
-- Field names copied verbatim from a table-form settings.http_timeout.
local http_timeout_fields = {
  'connect_timeout',
  'ssl_timeout',
  'write_timeout',
  'read_timeout',
}

-- Copies settings.http_timeout into the argument table of an
-- rspamd_http.request{} call. A non-table value goes into `timeout`;
-- a table value has its four granular fields copied one by one
-- (absent fields stay unset).
local function apply_http_timeout(http_params)
  local cfg = settings.http_timeout
  if type(cfg) ~= 'table' then
    http_params.timeout = cfg
    return
  end
  for _, field in ipairs(http_timeout_fields) do
    http_params[field] = cfg[field]
  end
end
--[[
Percent-encode bytes that may not appear literally in a URL (RFC 3986).
Redirect Location headers sometimes contain unencoded spaces and other
special characters that http_parser_parse_url() does not accept.

Every byte outside the unreserved/reserved ASCII set is replaced with its
%XX form (upper-case hex). '%' itself is part of the allowed set, so
already-encoded sequences are left alone and never double-encoded.

url_str: string to encode, or nil
returns: the encoded string, or nil when url_str is nil/false
]]
local function encode_url_for_redirect(url_str)
  if not url_str then
    return nil
  end
  -- Explicit ASCII ranges instead of %w, which is locale-dependent.
  -- Assigning to a single local drops gsub's second result (the
  -- substitution count), so callers get exactly one value back.
  local encoded = url_str:gsub("([^A-Za-z0-9%-%._~:/?#%[%]@!$&'()*+,;=%%])", function(c)
    return string.format("%%%02X", string.byte(c))
  end)
  return encoded
end
-- Renders a chain of URL objects as a single 'a->b->c' string.
-- HTTP(S) hops contribute their host; any other scheme contributes the
-- full URL text so that it stays distinguishable in the output.
local function chain_hosts_string(chain)
  local labels = {}
  for idx = 1, #chain do
    local hop = chain[idx]
    local scheme = hop:get_protocol()
    if scheme == 'http' or scheme == 'https' then
      labels[idx] = hop:get_host()
    else
      labels[idx] = hop:get_text()
    end
  end
  return table.concat(labels, '->')
end
-- Version tag mixed into every hashed cache key. Bump it whenever the
-- cached value format changes incompatibly: old entries then hash to a
-- different key and get re-resolved instead of being misread.
-- v1: the value is the raw (percent-encoded) URL, not the decoded text.
local cache_format_version = 'v1:'

-- Derives the per-URL Redis key: settings.key_prefix followed by the
-- first 32 characters of the base32 digest of (version tag .. url_str).
local function cache_key_for_url(url_str)
  local digest = hash.create(cache_format_version .. url_str)
  local encoded = digest:base32()
  return settings.key_prefix .. encoded:sub(1, 32)
end
-- Decides whether an intermediate hop is worth keeping, based on the two
-- gates in settings.save_intermediate_redirs: hosts found in
-- redirector_hosts_map are controlled by `.redirectors`, every other host
-- by `.non_redirectors`. Always returns a plain boolean; a missing hop
-- yields false.
local function should_save_hop(hop_url)
  if not hop_url then
    return false
  end
  local host = hop_url:get_host()
  local hosts_map = settings.redirector_hosts_map
  local listed = host and hosts_map and hosts_map:get_key(host)
  local gates = settings.save_intermediate_redirs
  if listed then
    return not not gates.redirectors
  end
  return not not gates.non_redirectors
end
-- Pushes hop_url onto the end of chain, unless it is missing or its
-- string form equals that of the current last element. The comparison is
-- by tostring() rather than identity because a cache-hit walk produces a
-- fresh object for an already-present URL, and an identity check would
-- record that self-loop as two separate hops.
local function chain_append(chain, hop_url)
  if not hop_url then
    return
  end
  local count = #chain
  local tail = chain[count]
  if tail ~= nil and tostring(hop_url) == tostring(tail) then
    return
  end
  chain[count + 1] = hop_url
end
-- Publishes a finished chain on the task. In order:
--   1. every hop is linked to its successor via set_redirected;
--   2. every hop after the first whose scheme is http/https is injected
--      as a task URL;
--   3. if settings.redirector_symbol is set, that symbol is inserted with
--      the '->'-joined chain as its option.
-- A chain shorter than two entries means no redirect happened, so
-- nothing is done.
local function apply_redirect_chain(task, chain)
  local count = #chain
  if count < 2 then
    return
  end

  local pool = task:get_mempool()
  for idx = 2, count do
    chain[idx - 1]:set_redirected(chain[idx], pool)
  end

  for idx = 2, count do
    local hop = chain[idx]
    local scheme = hop:get_protocol()
    if scheme == 'http' or scheme == 'https' then
      task:inject_url(hop)
    end
  end

  local symbol = settings.redirector_symbol
  if symbol then
    task:insert_result(symbol, 1.0, chain_hosts_string(chain))
  end
end
-- Persist a finalized chain to Redis: one SETEX per adjacent pair, keyed
-- by the hash of the earlier hop, with the next hop as the value.
--
-- Value markers:
--   * non-terminal links carry a '^hop:' marker so the reader keeps walking;
--   * the terminal link carries '^<terminal_prefix>:' when terminal_prefix
--     is given (e.g. 'nested', 'non_http'), otherwise no marker.
-- A length-1 chain is cached as a self-loop so that later scans of a URL
-- that resolved directly can be answered from the cache.
--
-- Popularity set (settings.top_urls_key): every written link also bumps
-- the unmarked next-hop string with ZINCRBY, and one ZCARD probe per chain
-- trims the set (ZREMRANGEBYRANK) once it has grown beyond twice
-- settings.top_urls_count. The key is optional -- module setup only
-- registers it when it is set -- so when it is falsy the popularity
-- bookkeeping is skipped entirely instead of sending commands without a
-- key.
--
-- task:            current task, passed to lua_redis and the logger
-- chain:           sequence of URL objects; an empty chain is a no-op
-- terminal_prefix: optional marker for the last link, or nil
local function cache_chain_to_redis(task, chain, terminal_prefix)
  if #chain == 0 then
    return
  end

  local top_urls_key = settings.top_urls_key

  local function trim_cb(err, _)
    if err then
      rspamd_logger.errx(task, 'got error trimming top urls set: %s', err)
    else
      rspamd_logger.infox(task, 'trimmed top urls set to %s elements',
        settings.top_urls_count)
    end
  end

  -- ZCARD reply: trim only when the set exceeds twice the target size.
  local function card_cb(err, data)
    if err then
      rspamd_logger.errx(task, 'got error reading top urls cardinality: %s', err)
      return
    end
    if data and tonumber(data) and tonumber(data) > settings.top_urls_count * 2 then
      -- Ranks 0 .. -(count + 1) are everything except the `count`
      -- highest-ranked members.
      local ret = lua_redis.redis_make_request(task,
        redis_params, top_urls_key, true, trim_cb,
        'ZREMRANGEBYRANK',
        { top_urls_key, '0',
          tostring(-(settings.top_urls_count + 1)) })
      if not ret then
        rspamd_logger.errx(task, 'cannot trim top urls set')
      end
    end
  end

  local function set_cb(err, _)
    if err then
      rspamd_logger.errx(task, 'got error caching redirect link: %s', err)
    end
  end

  -- Write one prev -> next link, optionally tagged with a marker.
  local function write_link(prev_url, next_url, marker)
    local link_key = cache_key_for_url(tostring(prev_url))
    -- Cache the raw (percent-encoded) form: keeps query boundaries intact so
    -- cached hops re-parse identically to live ones, and it is already URL-safe.
    local next_str = next_url:get_raw()
    local cache_value
    if marker then
      cache_value = string.format('^%s:%s', marker, next_str)
    else
      cache_value = next_str
    end
    local ret, conn, _ = lua_redis.redis_make_request(task,
      redis_params, link_key, true, set_cb,
      'SETEX', { link_key, tostring(settings.expire), cache_value })
    if not ret then
      rspamd_logger.errx(task, 'cannot cache redirect link for %s', prev_url)
    elseif conn and top_urls_key then
      conn:add_cmd('ZINCRBY', { top_urls_key, '1', next_str })
    end
  end

  if #chain == 1 then
    write_link(chain[1], chain[1], terminal_prefix)
  else
    for i = 1, #chain - 1 do
      local marker
      if i == #chain - 1 then
        marker = terminal_prefix
      else
        marker = 'hop'
      end
      write_link(chain[i], chain[i + 1], marker)
    end
  end

  -- One trim probe per finalized chain rather than per link.
  if top_urls_key then
    local ret = lua_redis.redis_make_request(task,
      redis_params, top_urls_key, false, card_cb,
      'ZCARD', { top_urls_key })
    if not ret then
      rspamd_logger.errx(task, 'cannot probe top urls cardinality')
    end
  end
end
-- Finishes a resolved chain in two steps: first publish it on the task
-- (apply_redirect_chain), then write it to the Redis cache
-- (cache_chain_to_redis), tagging the last link with terminal_prefix
-- when one is given.
local function finalize_chain(task, chain, terminal_prefix)
  apply_redirect_chain(task, chain)
  cache_chain_to_redis(task, chain, terminal_prefix)
end
-- Set of HTTP status codes treated as redirects worth following:
-- 301 moved permanently, 302 found, 303 see other,
-- 307 temporary redirect, 308 permanent redirect.
local redirection_codes = {}
for _, status in ipairs({ 301, 302, 303, 307, 308 }) do
  redirection_codes[status] = true
end
-- Forward declarations. step (the cache walk) and http_walk (the live
-- HTTP walk) call each other: step hands over to http_walk to extend a
-- partially resolved chain, and http_walk hands over to step when the
-- target of a 30x already has a cached chain. Both names must therefore
-- exist before either body is defined.
local step, http_walk

-- Common exit for step(). When the chain was extended over live HTTP
-- during this scan it is finalized (applied and written back to the
-- cache); otherwise it is only applied to the task. Kept as a standalone
-- function so step()'s recursive cache hops do not build a new closure
-- on every call.
local function step_finish(task, chain, http_extended, terminal_prefix)
  if not http_extended then
    apply_redirect_chain(task, chain)
    return
  end
  finalize_chain(task, chain, terminal_prefix)
end
-- Walk a cached redirect chain.
--
-- Parameters:
--   task          current task
--   orig_url      URL object the resolution started from
--   chain         sequence of URL objects resolved so far (mutated)
--   seen          per-scan set of URL strings shared with http_walk, used
--                 as a cycle guard across both the cache and HTTP layers
--   data          Redis value stored under hash(chain[#chain]); pass nil
--                 to have step() issue the GET itself
--   ntries        number of HTTP requests already issued in this scan
--                 (default 0). Cache hops pass it on unchanged so that a
--                 later hand-off to http_walk continues the same budget
--                 instead of starting a fresh nested_limit.
--   http_extended (default false) true when the chain already contains
--                 live-resolved entries that are not cached yet (step was
--                 entered from an http_walk splice). Exit paths then
--                 finalize the chain -- apply plus write-back -- instead
--                 of only applying it; pure cache walks keep the cheaper
--                 apply-only path and avoid redundant SETEX traffic.
--
-- Cached value format: '^hop:<url>' (keep walking), '^nested:<url>'
-- (previous scan ran out of HTTP budget at <url>), '^non_http:<url>'
-- (redirect to a non-HTTP(S) scheme), or a bare '<url>' (terminal).
step = function(task, orig_url, chain, seen, data, ntries, http_extended)
  ntries = ntries or 0
  http_extended = http_extended or false

  if data == nil then
    -- No value supplied: fetch the entry for the current tail and re-enter.
    local last = chain[#chain]
    local last_str = tostring(last)
    local next_key = cache_key_for_url(last_str)
    local ret = lua_redis.redis_make_request(task,
      redis_params, next_key, false,
      function(e, d)
        if e then
          rspamd_logger.errx(task,
            'redis error during chain walk at %s: %s', last_str, e)
          step_finish(task, chain, http_extended)
        elseif d == 'processing' then
          -- Another worker is currently resolving this hop; their write
          -- will populate the cache when they finish. Apply what we have
          -- and don't duplicate their HTTP work.
          lua_util.debugm(N, task,
            'cache lock at %s mid-walk, applying partial chain', last_str)
          step_finish(task, chain, http_extended)
        elseif type(d) ~= 'string' then
          -- True cache miss mid-walk: a previous chain link points to a
          -- URL whose own cache entry is gone (TTL expired or evicted).
          -- Resume live HTTP from this dead end so the chain rebuilds and
          -- gets re-cached, instead of giving up with a truncated chain.
          lua_util.debugm(N, task,
            'cache miss for %s mid-walk, extending with live HTTP', last_str)
          -- The prior ^hop iteration that appended `last` to the chain
          -- marked it as seen; http_walk re-marks it on entry, so clear
          -- here to avoid false-firing http_walk's cycle guard on the
          -- very URL we're bridging to.
          seen[last_str] = nil
          http_walk(task, orig_url, last, ntries + 1, chain, seen)
        else
          step(task, orig_url, chain, seen, d, ntries, http_extended)
        end
      end,
      'GET', { next_key })
    if not ret then
      rspamd_logger.errx(task, 'cannot make redis request to walk chain')
      step_finish(task, chain, http_extended)
    end
    return
  end

  -- Split an optional '^marker:' off the cached value.
  local prefix, val = nil, data
  if data:sub(1, 1) == '^' then
    local p, v = data:match('^%^([%w_]+):(.+)$')
    if p then
      prefix, val = p, v
    end
  end

  if seen[val] then
    lua_util.debugm(N, task, 'cycle in cached chain at %s', val)
    step_finish(task, chain, http_extended)
    return
  end

  local hop = rspamd_url.create(task:get_mempool(), val,
    { 'redirect_target' })
  if not hop then
    step_finish(task, chain, http_extended)
    return
  end
  chain_append(chain, hop)
  seen[val] = true

  if prefix == 'hop' then
    step(task, orig_url, chain, seen, nil, ntries, http_extended)
    return
  end

  if prefix == 'nested' then
    -- Cached walk ended on "we ran out of HTTP budget last time".
    -- Hand off to http_walk for a live extension. ntries+1 is the
    -- index of the next HEAD in this scan -- not 1 -- so any HEADs
    -- already done before the cache splice still count toward
    -- nested_limit. If the extension finalizes successfully, the
    -- upstream ^nested marker is rewritten as ^hop and the chain
    -- grows in cache.
    lua_util.debugm(N, task,
      'extending past cached ^nested:%s with live HTTP', val)
    -- http_walk's entry cycle guard looks up seen[tostring(url)]. When
    -- that string equals val, the mark set just above would make the
    -- guard fire on the very URL we are bridging to, and the chain would
    -- be re-cached as a plain terminal without ever being extended. That
    -- mark is ours (an earlier one would have hit the cycle check above),
    -- so drop it; http_walk re-marks the URL on entry. A differing string
    -- form is left alone so a genuine earlier visit is still detected.
    if tostring(hop) == val then
      seen[val] = nil
    end
    http_walk(task, orig_url, hop, ntries + 1, chain, seen)
    return
  end

  if prefix == 'non_http' then
    local rscheme = hop:get_protocol() or val:match('^([^:]+)')
    -- chain already includes hop (appended via chain_append above)
    task:insert_result(settings.redirector_symbol_non_http, 1.0,
      string.format('%s=%s', rscheme, chain_hosts_string(chain)))
    step_finish(task, chain, http_extended, 'non_http')
    return
  end

  -- Plain terminal: chain fully resolved, apply (and persist if extended).
  step_finish(task, chain, http_extended)
end
-- Live HTTP walk (HEAD by default, GET for URLs matching
-- settings.redirector_get_urls_map).
--
-- Parameters:
--   task      current task
--   orig_url  URL object the resolution started from (used for logging
--             and passed on unchanged)
--   url       URL object to request now
--   ntries    1-based index of this HTTP request within the scan; only
--             HTTP requests count, the cache walk (step()) does not
--             consume this budget. Bounded by settings.nested_limit.
--   chain     sequence of URL objects resolved so far (mutated)
--   seen      per-scan set of URL strings shared with step()
--
-- Every terminal outcome -- 200, request error, a redirect target that is
-- not a redirector under redirectors_only=true, any other status code, or
-- an unusable Location -- finalizes the chain. When nested_limit is
-- exhausted the chain is finalized with terminal_prefix='nested', so the
-- cache marks the tail with ^nested and a later scan can continue from
-- there with a fresh HTTP budget.
--
-- Before following a 30x target the cache is probed for it: shared
-- intermediates (e.g. several shortlinks funnelling through one
-- redirector host) are then walked via step() instead of being requested
-- again.
http_walk = function(task, orig_url, url, ntries, chain, seen)
  if ntries > settings.nested_limit then
    lua_util.debugm(N, task,
      'cannot get more http requests to resolve %s, stop on %s after %s attempts',
      orig_url, url, ntries)
    chain_append(chain, url)
    finalize_chain(task, chain, 'nested')
    task:insert_result(settings.redirector_symbol_nested, 1.0,
      string.format('%s:%d', chain_hosts_string(chain), ntries))
    return
  end

  -- Mirror the cache walk's cycle guard: a redirector loop A->B->A->B
  -- (e.g. login redirector flapping between two hosts) would otherwise
  -- chew through nested_limit and bloat the chain with alternating
  -- entries. tostring() (not get_raw) is used because the cycle guard,
  -- the cache key and the GET-map match need a stable identity that
  -- collapses encoding variants.
  local url_str = tostring(url)
  if seen[url_str] then
    lua_util.debugm(N, task, 'cycle in http walk at %s', url_str)
    finalize_chain(task, chain, nil)
    return
  end
  seen[url_str] = true

  local function http_callback(err, code, _, headers)
    if err then
      rspamd_logger.infox(task,
        'found redirect error from %s to %s, err message: %s',
        orig_url, url, err)
      chain_append(chain, url)
      finalize_chain(task, chain, nil)
      return
    end

    if code == 200 then
      if orig_url == url then
        rspamd_logger.infox(task, 'url %s resolved directly (HTTP 200)', url)
      else
        rspamd_logger.infox(task,
          'redirect chain resolved: %s -> %s (HTTP 200)', orig_url, url)
      end
      chain_append(chain, url)
      finalize_chain(task, chain, nil)
      return
    end

    if redirection_codes[code] then
      local loc = headers['location']
      local redir_url
      if loc then
        -- Encode problematic characters (spaces, etc.) that
        -- http_parser doesn't accept. Fixes issue #5525.
        local encoded_loc = encode_url_for_redirect(loc)
        redir_url = rspamd_url.create(task:get_mempool(), encoded_loc)
        if not redir_url and encoded_loc ~= loc then
          rspamd_logger.infox(task,
            'failed to parse redirect location even after encoding: %s', loc)
        end
      end
      lua_util.debugm(N, task, 'redirect from %s to %s (HTTP %s)',
        orig_url, loc, code)

      -- 'url' just returned 30x, so it's an intermediate. Save it
      -- only when gating allows. When extending past a cached
      -- ^nested marker, url is the cached terminal that step() just
      -- appended to chain -- in both cases it's already the tail.
      if should_save_hop(url) then
        chain_append(chain, url)
      end

      if redir_url then
        local rscheme = redir_url:get_protocol()
        if rscheme ~= 'http' and rscheme ~= 'https' then
          lua_util.debugm(N, task, 'stop resolving redirects: %s has non-http(s) scheme %s', loc, rscheme)
          chain_append(chain, redir_url)
          task:insert_result(settings.redirector_symbol_non_http, 1.0,
            string.format('%s=%s', rscheme, chain_hosts_string(chain)))
          finalize_chain(task, chain, 'non_http')
          return
        end

        local should_follow = true
        if settings.redirectors_only then
          -- Test the lookup for truthiness, like every other
          -- redirector_hosts_map lookup in this file. Comparing with
          -- `~= nil` would treat a miss reported as `false` as a hit and
          -- follow targets that are not redirectors.
          should_follow = settings.redirector_hosts_map:get_key(redir_url:get_host())
            and true or false
        end

        if should_follow then
          -- Probe cache for redir_url before HEADing it. If a chain is
          -- already cached at hash(redir_url) (typical when many
          -- shortlinks share a redirector intermediate, or when a prior
          -- scan resolved redir_url as its own orig), splice into step
          -- and let the cache walk continue from there. Cache miss/lock:
          -- fall back to live HEAD as before.
          local k = cache_key_for_url(tostring(redir_url))
          local ret = lua_redis.redis_make_request(task,
            redis_params, k, false,
            function(probe_err, probe_data)
              if not probe_err
                  and type(probe_data) == 'string'
                  and probe_data ~= 'processing' then
                lua_util.debugm(N, task,
                  'cache hit on redirect target %s, splicing into cache walk',
                  redir_url)
                chain_append(chain, redir_url)
                seen[tostring(redir_url)] = true
                -- Pass current ntries so any onward ^nested-bridge
                -- inside step counts HEADs already done in this
                -- scan toward nested_limit, instead of resetting.
                -- http_extended=true so step's terminal path will
                -- finalize_chain (cache the newly-resolved live link
                -- from this http_walk to redir_url, otherwise the
                -- 'processing' marker at hash(orig_url) is never
                -- replaced with the actual chain).
                step(task, orig_url, chain, seen, probe_data, ntries, true)
              else
                http_walk(task, orig_url, redir_url, ntries + 1, chain, seen)
              end
            end,
            'GET', { k })
          if not ret then
            rspamd_logger.errx(task,
              'cannot probe cache for redirect target, falling through to HEAD')
            http_walk(task, orig_url, redir_url, ntries + 1, chain, seen)
          end
        else
          lua_util.debugm(N, task,
            'stop resolving redirects as %s is not a redirector', loc)
          chain_append(chain, redir_url)
          finalize_chain(task, chain, nil)
        end
      elseif loc then
        local raw_scheme = loc:match('^([A-Za-z][A-Za-z0-9+%-.]*):')
        if raw_scheme and raw_scheme ~= 'http' and raw_scheme ~= 'https' then
          lua_util.debugm(N, task, 'stop resolving redirects: %s has non-http(s) scheme %s (unparseable url)', loc, raw_scheme)
          -- loc cannot be parsed into a URL object, so it cannot be appended to
          -- chain or cached with a ^non_http marker. Emit the symbol now and cache
          -- as a normal terminal; future scans within the TTL won't re-emit it.
          task:insert_result(settings.redirector_symbol_non_http, 1.0,
            string.format('%s=%s->%s', raw_scheme, chain_hosts_string(chain), loc))
          finalize_chain(task, chain, nil)
        else
          lua_util.debugm(N, task, 'failed to parse location %s, headers: %s', loc, headers)
          chain_append(chain, url)
          finalize_chain(task, chain, nil)
        end
      else
        lua_util.debugm(N, task, 'no location, headers: %s', headers)
        chain_append(chain, url)
        finalize_chain(task, chain, nil)
      end
      return
    end

    -- Other non-30x non-200 status: treat current url as terminal.
    lua_util.debugm(N, task,
      'found redirect error from %s to %s, err code: %s',
      orig_url, url, code)
    chain_append(chain, url)
    finalize_chain(task, chain, nil)
  end

  local method = 'head'
  if settings.redirector_get_urls_map
      and settings.redirector_get_urls_map:get_key(url_str) then
    method = 'get'
  end

  -- Request the raw (percent-encoded) URL: the decoded form would let a wrapper
  -- mis-split its ?u=https%3A%2F%2F... target at the now-literal '&' and truncate.
  local request_url = url:get_raw()
  local http_params = {
    url = request_url,
    task = task,
    method = method,
    max_size = settings.max_size,
    opaque_body = true,
    no_ssl_verify = not settings.check_ssl,
    callback = http_callback,
  }

  if settings.user_agent then
    -- Operator override: a single User-Agent header, no fingerprint.
    local ua = settings.user_agent
    if type(ua) ~= 'string' then
      ua = ua[math.random(#ua)]
    end
    http_params.headers = { ['User-Agent'] = ua }
    lua_util.debugm(N, task, 'query %s %s with user agent %s',
      method, url_str, ua)
  else
    -- Stealth: one coherent browser fingerprint per task, reused by
    -- every hop of every chain so the identity stays consistent. The
    -- chosen profile is remembered in the task cache.
    local profile = task:cache_get('url_redirector_profile')
    if not profile then
      local profiles = settings.fingerprint_profiles
      if profiles and #profiles > 0 then
        profile = profiles[math.random(#profiles)]
        task:cache_set('url_redirector_profile', profile)
      end
    end
    if profile then
      http_params.headers = profile.headers
      lua_util.debugm(N, task, 'query %s %s with %s fingerprint',
        method, url_str, profile.name)
    else
      lua_util.debugm(N, task, 'query %s %s (no fingerprint profile)',
        method, url_str)
    end
  end

  apply_http_timeout(http_params)
  rspamd_http.request(http_params)
end
-- Entry point for resolving one URL.
--
-- The cache is asked for orig_url first. A usable entry (a string other
-- than 'processing') starts a cache walk via step() with ntries=0, so a
-- later ^nested bridge gets the full nested_limit budget. Anything else
-- -- miss, error or 'processing' -- leads to an attempt to claim a
-- 'processing' lock with SET ... NX and a TTL of floor(timeout) + 1
-- seconds. Only the scan that gets 'OK' back resolves over live HTTP;
-- other scans skip the URL and leave it to the lock holder or to a later
-- scan once the lock has expired.
--
-- Cache walks are unbounded; only HTTP requests consume nested_limit.
-- Cycle protection is the `seen` set of URL strings shared by step and
-- http_walk. It is deliberately not pre-seeded with orig_url: a URL that
-- resolved directly is cached as a self-loop, and a pre-seed would flag
-- that legitimate terminal as a cycle. chain_append's string comparison
-- keeps orig_url from being appended twice in that case.
local function resolve_cached(task, orig_url)
  local key = cache_key_for_url(tostring(orig_url))
  local chain, seen = { orig_url }, {}

  -- Reply to the SET NX lock attempt.
  local function on_lock_reply(lock_err, lock_reply)
    if lock_err then
      rspamd_logger.errx(task,
        'got error while setting redirect keys: %s', lock_err)
      return
    end
    if lock_reply ~= 'OK' then
      lua_util.debugm(N, task,
        'failed to claim lock for %s (held by another worker or stale processing marker, ndata=%s); skipping this scan',
        orig_url, lock_reply)
      return
    end
    http_walk(task, orig_url, orig_url, 1, chain, seen)
  end

  -- Try to take the 'processing' lock for this URL.
  local function claim_lock()
    local lock_ttl = tostring(math.floor(settings.timeout) + 1)
    local scheduled = lua_redis.redis_make_request(task,
      redis_params, key, true, on_lock_reply,
      'SET',
      { key, 'processing', 'EX', lock_ttl, 'NX' })
    if not scheduled then
      rspamd_logger.errx(task, "Couldn't schedule SET")
    end
  end

  -- Reply to the initial cache lookup.
  local function on_cache_reply(err, data)
    local is_hit = not err and type(data) == 'string' and data ~= 'processing'
    if not is_hit then
      claim_lock()
      return
    end
    lua_util.debugm(N, task, 'found cached redirect from %s to %s',
      orig_url, data)
    step(task, orig_url, chain, seen, data, 0)
  end

  local scheduled = lua_redis.redis_make_request(task,
    redis_params, key, false, on_cache_reply,
    'GET', { key })
  if not scheduled then
    rspamd_logger.errx(task, 'cannot make redis request to check results')
  end
end
-- Resolves a single selected URL; a thin wrapper that hands the URL to
-- resolve_cached.
local function url_redirector_process_url(task, url)
  resolve_cached(task, url)
end
-- True when the URL object uses the http or https scheme -- the only
-- schemes the resolver requests.
local function is_http_url(url)
  local proto = url:get_protocol()
  return proto == 'http' or proto == 'https'
end

-- Symbol callback: picks up to settings.max_urls URLs whose host is in
-- redirector_hosts_map and starts resolution for each of them.
--
-- Selection order:
--   1. CTA URLs of HTML text parts (as returned by part:get_cta_urls);
--   2. if slots remain, URLs from lua_util.extract_specific_urls.
-- Both paths accept only http/https URLs on a redirector host and skip
-- URLs whose string form was already selected. The CTA path applies the
-- same scheme filter as the second path so that a non-HTTP link on a
-- redirector host never reaches http_walk, where it would only waste a
-- request and its timeout.
local function url_redirector_handler(task)
  -- task:has_urls returns (bool, count) without materialising the URL
  -- table; bail out cheaply when the message has no URLs at all so we
  -- skip the CTA scan and extract_specific_urls call entirely.
  local has_urls, n_urls = task:has_urls()
  if not has_urls then
    lua_util.debugm(N, task, 'no URLs in task, skipping redirector resolution')
    return
  end

  local selected = {}
  local seen = {}

  for _, part in ipairs(task:get_text_parts()) do
    if part:is_html() then
      for _, url in ipairs(part:get_cta_urls(settings.max_urls, true)) do
        local host = url:get_host()
        if is_http_url(url)
            and host and settings.redirector_hosts_map:get_key(host) then
          local key = tostring(url)
          if not seen[key] then
            lua_util.debugm(N, task, 'prefer CTA url %s for redirector', key)
            table.insert(selected, url)
            seen[key] = true
            if #selected >= settings.max_urls then
              break
            end
          end
        end
      end
    end
    if #selected >= settings.max_urls then
      break
    end
  end

  local remaining = settings.max_urls - #selected
  if remaining > 0 then
    local sp_urls = lua_util.extract_specific_urls({
      task = task,
      limit = remaining,
      filter = function(url)
        -- task:get_urls()'s default protocol mask is HTTP|HTTPS|FILE|FTP.
        -- We only follow HTTP(S); silently drop the rest at selection
        -- rather than letting them reach http_walk and waste a HEAD
        -- timeout. URL_REDIRECTOR_NON_HTTP is reserved for the case
        -- where an HTTP redirect points at a non-HTTP scheme.
        if not is_http_url(url) then
          return false
        end
        local host = url:get_host()
        if host and settings.redirector_hosts_map:get_key(host) then
          local key = tostring(url)
          if not seen[key] then
            lua_util.debugm(N, task, 'consider redirector url %s', key)
            return true
          end
        end
        return false
      end,
      no_cache = true,
      need_content = true,
    })
    if sp_urls then
      for _, u in ipairs(sp_urls) do
        local key = tostring(u)
        if not seen[key] then
          table.insert(selected, u)
          seen[key] = true
          if #selected >= settings.max_urls then
            break
          end
        end
      end
    end
  end

  if #selected == 0 then
    lua_util.debugm(N, task,
      'no URLs matched redirector_hosts_map (out of %d task URLs)',
      n_urls)
  end

  for _, u in ipairs(selected) do
    url_redirector_process_url(task, u)
  end
end
-- Module setup: merge the user's 'url_redirector' options over the
-- defaults, resolve the Redis servers, load the maps and register the
-- symbols. Without an options section nothing is registered at all.
local opts = rspamd_config:get_all_opt('url_redirector')
if opts then
  settings = lua_util.override_defaults(settings, opts)

  -- Pass redis_timeout to lua_redis instead of the symbol budget.
  -- Nested redis{} block needs the override too -- parse_redis_server
  -- reads opts.redis directly when present and never falls back to
  -- opts.timeout. Copies are used so the user's option tables stay
  -- untouched.
  local redis_opts = lua_util.shallowcopy(opts)
  redis_opts.timeout = settings.redis_timeout
  if redis_opts.redis then
    local nested_redis = lua_util.shallowcopy(redis_opts.redis)
    redis_opts.redis = nested_redis
    if not nested_redis.timeout then
      nested_redis.timeout = settings.redis_timeout
    end
  end
  redis_params = lua_redis.parse_redis_server('url_redirector', redis_opts)

  if not redis_params then
    rspamd_logger.infox(rspamd_config, 'no servers are specified, disabling module')
    lua_util.disable_module(N, "redis")
  elseif not settings.redirector_hosts_map then
    rspamd_logger.infox(rspamd_config, 'no redirector_hosts_map option is specified, disabling module')
    lua_util.disable_module(N, "config")
  else
    local lua_maps = require "lua_maps"

    -- Replace the map definitions from the configuration with live maps.
    settings.redirector_hosts_map = lua_maps.map_add_from_ucl(settings.redirector_hosts_map,
      'glob', 'Redirectors definitions (glob: bare names match exactly, *.foo matches subs)')
    if settings.redirector_get_urls_map then
      settings.redirector_get_urls_map = lua_maps.map_add_from_ucl(
        settings.redirector_get_urls_map, 'regexp',
        'URL redirector: URLs to fetch with GET instead of HEAD')
    end

    -- Announce the Redis keys this module writes.
    lua_redis.register_prefix(settings.key_prefix .. '[a-z0-9]{32}', N,
      'URL redirector hashes', {
        type = 'string',
      })
    if settings.top_urls_key then
      lua_redis.register_prefix(settings.top_urls_key, N,
        'URL redirector top urls', {
          type = 'zlist',
        })
    end

    -- The callback symbol that does the work ...
    local id = rspamd_config:register_symbol {
      name = 'URL_REDIRECTOR_CHECK',
      type = 'callback,prefilter',
      priority = lua_util.symbols_priorities.medium,
      callback = url_redirector_handler,
      augmentations = { string.format("timeout=%f", settings.timeout) }
    }

    -- ... and the zero-score virtual symbols it may insert.
    local function register_virtual(symbol_name)
      rspamd_config:register_symbol {
        name = symbol_name,
        type = 'virtual',
        parent = id,
        score = 0,
      }
    end

    register_virtual(settings.redirector_symbol_nested)
    register_virtual(settings.redirector_symbol_non_http)
    if settings.redirector_symbol then
      register_virtual(settings.redirector_symbol)
    end
  end
end