Skip to content

Commit cb34f46

Browse files
committed
fix: geo_fanout reads the target status via Unlocker format:json (not the gateway 200)
The geo_fanout executor called the Unlocker /request with format:'raw' and then classified axios response.status, which is always the gateway's 200. A target 403/451/redirect was therefore misclassified as ok, defeating the whole point of the tool (surfacing geo-gating as a first-class classified result).

Fix: call /request with format:'json'. The response body is the envelope {status_code, headers, body} where status_code is the TARGET's HTTP status and headers are the TARGET's response headers. A 3xx is surfaced WITHOUT the Unlocker following it, so a redirect keeps its Location header. We classify on that real target status.

- geo_utils.js: add parse_unlocker_json(data) mapping {status_code, headers, body} to the {status, headers, body} shape build_geo_entry expects (tolerant of a missing/non-object/JSON-string input). build_geo_entry now carries the rendered body through and documents exit_ip as best-effort null (the json envelope has no gateway x-brd-* headers; we never fabricate an IP).
- server.js: executor uses format:'json' + responseType:'json', parses with parse_unlocker_json, passes the target {status, headers} into build_geo_entry, and renders the body to markdown via the existing remark/strip pipeline when data_format is markdown (raw otherwise). Per-geo country targeting preserved.
- tests: geo-utils fixtures now use the REAL Unlocker envelope shape and drive build_geo_entry end-to-end through parse_unlocker_json (403 -> blocked, 302 with a cross-host location -> redirected, 200 -> ok, 429 -> rate_limited, thrown transport error -> error); parse_unlocker_json gets its own table-driven test. The stdio registration test is unchanged.

Based on PR #155 (retry/backoff for #104); retry_utils.js classification is reused.
1 parent 8265890 commit cb34f46

5 files changed

Lines changed: 615 additions & 2 deletions

File tree

geo_utils.js

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
'use strict'; /*jslint node:true es9:true*/
2+
3+
// Pure helpers for the geo_fanout tool: validating the list of country exits
4+
// and turning a set of per-geo results into a single structured report where a
5+
// blocked / redirected / failed geo is a FIRST-CLASS classified result, not a
6+
// discarded error. No network or transport dependencies live here so the fanout
7+
// aggregation logic is unit-testable in isolation.
8+
9+
import {classify_response, OUTCOME} from './retry_utils.js';
10+
// Per-geo status the caller sees in the aggregated report. Frozen so the
// shared vocabulary cannot be rewritten by accident at runtime.
export const GEO_STATUS = Object.freeze({
    OK: 'ok', // fetched successfully
    BLOCKED: 'blocked', // target refused this geo (403/451)
    REDIRECTED: 'redirected', // 3xx to a different host/path (geo gating signal)
    RATE_LIMITED: 'rate_limited', // target throttled this geo (429)
    ERROR: 'error', // transient/fatal failure after retries
});
19+
// Map the Unlocker's JSON envelope to the shape build_geo_entry expects.
// When the /request endpoint is called with format:'json', the gateway always
// answers HTTP 200 and the TARGET's real result lives in the JSON body:
//   {status_code, headers, body}
// where status_code is the target's HTTP status and headers are the target's
// response headers. The previous format:'raw' path only ever exposed the
// gateway's 200, so a target 403/451/3xx was misclassified as ok.
//
// `data` may be the already-parsed envelope object or its JSON text.
// Returns {status, headers, body}:
//   status  - obj.status_code when it is a finite number, otherwise null so
//             the caller never mistakes an unreadable envelope for a success;
//   headers - obj.headers when it is a plain (non-array) object, otherwise
//             undefined;
//   body    - obj.body passed through untouched.
// A missing / non-object / unparseable input yields
// {status: null, headers: undefined, body: undefined}. Never throws.
export function parse_unlocker_json(data){
    let obj = data;
    if (typeof obj=='string')
    {
        // Some transports hand back the envelope as text; invalid JSON is
        // treated the same as a missing envelope.
        try { obj = JSON.parse(obj); }
        catch(e){ obj = null; }
    }
    if (!obj || typeof obj!='object')
        return {status: null, headers: undefined, body: undefined};
    const status = typeof obj.status_code=='number'
        && Number.isFinite(obj.status_code) ? obj.status_code : null;
    // An array is typeof 'object' but is never a header map.
    const headers = obj.headers && typeof obj.headers=='object'
        && !Array.isArray(obj.headers) ? obj.headers : undefined;
    return {status, headers, body: obj.body};
}
46+
// Validate and normalize a list of 2-letter country codes.
// Each entry is trimmed and lowercased, then must match /^[a-z]{2}$/.
// Duplicates are collapsed, keeping first-seen order.
//
// Returns the cleaned array of codes.
// Throws Error when `geos` is not a non-empty array, when an entry is not a
// string, or when an entry is not exactly two letters after trimming. A typo
// therefore fails loudly instead of becoming a silently-dropped exit.
export function normalize_geos(geos){
    if (!Array.isArray(geos) || geos.length===0)
        throw new Error('geos must be a non-empty array of 2-letter codes');
    // A Set iterates in insertion order, so it dedupes and preserves
    // first-seen order in one structure.
    const codes = new Set();
    for (const raw of geos)
    {
        if (typeof raw!='string')
            throw new Error(`invalid geo (not a string): ${JSON.stringify(raw)}`);
        const code = raw.trim().toLowerCase();
        if (!/^[a-z]{2}$/.test(code))
        {
            throw new Error(`invalid geo country code: "${raw}" `
                +`(expected 2 letters, e.g. "us", "de")`);
        }
        codes.add(code);
    }
    return [...codes];
}
71+
// Detect whether a response is a redirect and whether it leaves the
// requested host. A 3xx with a Location header pointing at a different host
// is the classic "visitor bounced to a different storefront" signal.
//
// status      - HTTP status of the response (any non-3xx value returns null).
// headers     - response header map; the Location lookup is case-insensitive.
// request_url - the URL that was requested; also the base for resolving a
//               relative Location.
//
// Returns null when status is not in [300, 400). Otherwise returns
// {redirected: true, location, cross_host}, where location is the Location
// header value (null when absent) and cross_host is true only when both URLs
// parse and their hosts differ.
function detect_redirect(status, headers, request_url){
    if (!(status>=300 && status<400))
        return null;
    const location = read_header(headers, 'location');
    if (!location)
        return {redirected: true, location: null, cross_host: false};
    let cross_host;
    try {
        // URL.host includes the port, so a port change also counts as a
        // different host.
        cross_host = new URL(request_url).host
            !==new URL(location, request_url).host;
    } catch(e){
        // An unparseable request URL or Location cannot prove a host change.
        cross_host = false;
    }
    return {redirected: true, location, cross_host};
}
91+
// Case-insensitive header lookup.
// headers - header map (any object); anything else yields undefined.
// name    - header name to look for, in any case.
// Returns the value stored under the first own enumerable key whose
// lowercased form equals the lowercased name, or undefined when none matches.
function read_header(headers, name){
    if (!headers || typeof headers!='object')
        return undefined;
    const wanted = name.toLowerCase();
    const key = Object.keys(headers).find(k=>k.toLowerCase()===wanted);
    return key===undefined ? undefined : headers[key];
}
103+
// Map one classified outcome (an OUTCOME value from retry_utils.js) to the
// per-geo report status. Any outcome without an explicit mapping, including
// an unknown or missing value, is reported as GEO_STATUS.ERROR.
function outcome_to_geo_status(outcome){
    switch (outcome)
    {
    case OUTCOME.SUCCESS:
        return GEO_STATUS.OK;
    case OUTCOME.REDIRECT:
        return GEO_STATUS.REDIRECTED;
    case OUTCOME.BLOCKED:
        return GEO_STATUS.BLOCKED;
    case OUTCOME.RATE_LIMITED:
        return GEO_STATUS.RATE_LIMITED;
    default:
        return GEO_STATUS.ERROR;
    }
}
115+
// Build the per-geo entry for a single settled attempt. `attempt` is the
// normalized shape produced by the tool's executor:
//   {geo, url, response:{status, headers, exit_ip?}, body?} on a completed
//   request (status/headers are the TARGET's, parsed from the Unlocker
//   format:'json' envelope, NOT the gateway's 200), or
//   {geo, url, error:{code?, response?}} on a thrown failure.
// `now_ms` is forwarded to classify_response and is injectable so tests get
// deterministic results.
//
// Returns {geo, url, status, http_status, exit_ip, outcome, retry_after_ms,
// reason, redirect} plus `body` only when the attempt carried one. A
// non-object attempt is treated as an empty one (geo 'unknown', url null).
export function build_geo_entry(attempt, now_ms = Date.now()){
    const obj = attempt && typeof attempt=='object' ? attempt : {};
    const geo = typeof obj.geo=='string' ? obj.geo : 'unknown';
    const url = typeof obj.url=='string' ? obj.url : null;
    // Normalize to the shape classify_response understands: a thrown error keeps
    // its {error} envelope; a completed request passes its {status, headers}.
    const failed = !!obj.error;
    const classify_input = failed ? {error: obj.error} : (obj.response || {});
    const classification = classify_response(classify_input, now_ms);
    const status = classification.status;
    // Read headers from the same source that was classified, so redirect
    // detection never pairs an error's status with headers from an unrelated
    // `response` field.
    let headers;
    if (failed)
        headers = obj.error.response ? obj.error.response.headers : undefined;
    else if (obj.response)
        headers = obj.response.headers;
    // exit_ip is best-effort: the Unlocker format:'json' envelope carries the
    // target's headers, not the gateway's x-brd-* headers, so the exit IP is not
    // observable on this path. We surface whatever the executor explicitly
    // supplied (none, today) and otherwise null; we never fabricate an IP.
    const exit_ip = obj.response && typeof obj.response.exit_ip=='string'
        ? obj.response.exit_ip : null;

    const redirect = detect_redirect(status, headers, url);
    // A detected 3xx always wins over the classifier's own outcome mapping.
    const geo_status = redirect && redirect.redirected
        ? GEO_STATUS.REDIRECTED
        : outcome_to_geo_status(classification.outcome);

    const entry = {
        geo,
        url,
        status: geo_status,
        http_status: status,
        exit_ip,
        outcome: classification.outcome,
        retry_after_ms: classification.retry_after_ms,
        reason: classification.reason,
        redirect: redirect || null,
    };
    // The rendered target body (markdown or raw) when the executor captured one.
    // Kept out of the entry entirely when absent so an error/blocked geo is not
    // padded with an empty string that reads like real content.
    if (obj.body!==undefined)
        entry.body = obj.body;
    return entry;
}
167+
// Aggregate all per-geo entries into one report. Every entry is counted
// exactly once regardless of success/failure; nothing is dropped. The summary
// makes the "this geo was blocked / redirected" fact queryable at a glance.
//
// entries - array of entries as built by build_geo_entry; a non-array is
//           treated as an empty list.
// Returns {summary, any_blocked, any_redirected, results}, where summary holds
// `total` plus one counter per GEO_STATUS value and results is the input list.
// An entry whose status is unrecognized, or that is not an object at all, is
// counted under `error` so the counters always add up to `total`.
export function summarize_fanout(entries){
    const list = Array.isArray(entries) ? entries : [];
    const summary = {
        total: list.length,
        ok: 0,
        blocked: 0,
        redirected: 0,
        rate_limited: 0,
        error: 0,
    };
    for (const e of list)
    {
        // Guard against null/primitive entries so one bad slot cannot crash
        // the whole report.
        const status = e && typeof e=='object' ? e.status : undefined;
        switch (status)
        {
        case GEO_STATUS.OK:
            summary.ok++;
            break;
        case GEO_STATUS.BLOCKED:
            summary.blocked++;
            break;
        case GEO_STATUS.REDIRECTED:
            summary.redirected++;
            break;
        case GEO_STATUS.RATE_LIMITED:
            summary.rate_limited++;
            break;
        default:
            summary.error++;
            break;
        }
    }
    return {
        summary,
        any_blocked: summary.blocked>0,
        any_redirected: summary.redirected>0,
        results: list,
    };
}

server.js

Lines changed: 84 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ import {parse_google_search_response} from './search_utils.js';
1010
import {dataset_id_schema, filter_schema, metadata_to_fields, FILTER_OPERATORS}
1111
from './search_dataset_schema.js';
1212
import {classify_response, should_retry} from './retry_utils.js';
13+
import {normalize_geos, parse_unlocker_json, build_geo_entry, summarize_fanout}
14+
from './geo_utils.js';
1315
import {createRequire} from 'node:module';
1416
import {remark} from 'remark';
1517
import strip from 'strip-markdown';
@@ -25,7 +27,7 @@ const base_timeout = process.env.BASE_TIMEOUT
2527
const base_max_retries = Math.max(0,
2628
Math.min(parseInt(process.env.BASE_MAX_RETRIES || '0', 10) || 0, 3));
2729
const pro_mode_tools = ['search_engine', 'scrape_as_markdown',
28-
'search_engine_batch', 'scrape_batch', 'discover'];
30+
'search_engine_batch', 'scrape_batch', 'discover', 'geo_fanout'];
2931
const tool_groups = process.env.GROUPS ?
3032
process.env.GROUPS.split(',').map(g=>g.trim().toLowerCase())
3133
.filter(Boolean) : [];
@@ -74,7 +76,7 @@ if (!api_token)
7476

7577
const sleep = ms=>new Promise(resolve=>setTimeout(resolve, ms));
7678

77-
// Backoff knobs (overridable via env) used by base_request.
79+
// Backoff knobs (overridable via env) used by base_request and geo_fanout.
7880
// Issue #104: bursts of MCP calls hit intermittent 502/504 from the gateway and
7981
// there was no backoff guidance. We now classify each failure and retry only the
8082
// transient ones with exponential backoff + full jitter, honoring Retry-After.
@@ -429,6 +431,86 @@ addTool({
429431
}),
430432
});
431433

// geo_fanout: fetch one URL through several country exits concurrently and
// return a single JSON report (see summarize_fanout). Each geo settles into
// its own classified entry; a failing geo never rejects the whole fanout
// because every attempt catches its own errors.
addTool({
    name: 'geo_fanout',
    description: 'Fetch the SAME url from multiple country exits in parallel and '
        +'return one structured report. A geo that is blocked (403/451), '
        +'redirected (3xx to a different host), rate-limited (429) or fails '
        +'transiently becomes a FIRST-CLASS classified result, not a discarded '
        +'error. Ideal for detecting geo-gating, regional price/availability '
        +'differences, and access denial across countries.',
    annotations: {
        title: 'Geo Fanout',
        readOnlyHint: true,
        openWorldHint: true,
    },
    parameters: z.object({
        url: z.string().url(),
        countries: z.array(z.string().length(2))
            .min(1)
            .max(10)
            .describe('2-letter ISO country codes to fan the request across '
                +'(e.g., ["de", "be", "fr"]). Deduped; max 10.'),
        data_format: z.enum(['raw', 'markdown'])
            .optional()
            .default('markdown')
            .describe('Response body format per geo (default: markdown).'),
    }),
    execute: tool_fn('geo_fanout', async({url, countries, data_format}, ctx)=>{
        const geos = normalize_geos(countries);
        // One shared timestamp so every entry is classified against the same
        // clock reading.
        const now = Date.now();
        const want_markdown = data_format=='markdown';
        const attempts = geos.map(async geo=>{
            try {
                // format:'json' makes the Unlocker return a JSON envelope
                // {status_code, headers, body} where status_code is the TARGET's
                // HTTP status and headers are the TARGET's response headers. With
                // the previous format:'raw' we only ever saw the gateway's 200,
                // so a target 403/451/3xx was misclassified as ok. We classify on
                // the real target status here. data_format still controls how the
                // target body is rendered (markdown vs the raw body) inside the
                // envelope; the 3xx is surfaced in the envelope WITHOUT the
                // Unlocker following it, so a redirect keeps its Location header.
                const response = await base_request({
                    url: 'https://api.brightdata.com/request',
                    method: 'POST',
                    data: {
                        url,
                        zone: unlocker_zone,
                        format: 'json',
                        ...want_markdown ? {data_format: 'markdown'} : {},
                        country: geo,
                    },
                    headers: api_headers(ctx.clientName, 'geo_fanout'),
                    responseType: 'json',
                });
                const parsed = parse_unlocker_json(response.data);
                let body = parsed.body;
                if (want_markdown && typeof body=='string' && body)
                {
                    try {
                        body = (await remark()
                            .use(strip, {keep: ['link', 'linkReference', 'code',
                                'inlineCode']})
                            .process(body)).value;
                    } catch(e){
                        // Rendering is cosmetic. The target status is already
                        // known, so keep the unrendered body rather than
                        // letting a renderer failure reclassify this geo as a
                        // transport error.
                        body = parsed.body;
                    }
                }
                return build_geo_entry({
                    geo,
                    url,
                    body,
                    response: {
                        status: parsed.status,
                        headers: parsed.headers,
                    },
                }, now);
            } catch(e){
                // Request failure: hand the raw error to the classifier so the
                // geo still appears in the report.
                return build_geo_entry({geo, url, error: e}, now);
            }
        });
        const entries = await Promise.all(attempts);
        return JSON.stringify(summarize_fanout(entries), null, 2);
    }),
});
513+
432514
addTool({
433515
name: 'scrape_as_html',
434516
description: 'Scrape a single webpage URL with advanced options for '

test/geo-fanout-tool.test.js

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
'use strict'; /*jslint node:true es9:true*/
2+
import test from 'node:test';
3+
import assert from 'node:assert/strict';
4+
import {fileURLToPath} from 'node:url';
5+
import {dirname, resolve} from 'node:path';
6+
import {Client} from '@modelcontextprotocol/sdk/client/index.js';
7+
import {StdioClientTransport} from '@modelcontextprotocol/sdk/client/stdio.js';
8+
// Resolve the repository root from this file's own location so the server is
// spawned from the right directory whatever the test runner's cwd is.
const test_dir = dirname(fileURLToPath(import.meta.url));
const repo_root = resolve(test_dir, '..');

// Registration smoke test: start server.js as a child process over stdio with
// a dummy token and PRO_MODE enabled, list its tools, and check that
// geo_fanout is exposed with the expected description and parameters. The
// tool itself is never invoked here.
test('geo_fanout tool is registered and well-formed over stdio', async()=>{
    const env = {
        ...process.env,
        API_TOKEN: 'dummy-token',
        PRO_MODE: 'true',
    };
    const client = new Client(
        {name: 'geo-fanout-test', version: '0.0.1'},
        {capabilities: {tools: {}}});
    const transport = new StdioClientTransport({
        command: process.execPath,
        args: ['server.js'],
        cwd: repo_root,
        env,
    });
    try {
        await client.connect(transport);
        const {tools} = await client.listTools();
        const geo_fanout = tools.find(tool=>tool.name=='geo_fanout');
        assert.ok(geo_fanout, 'geo_fanout tool is exposed');
        assert.match(geo_fanout.description, /first-class/i,
            'description documents first-class classified results');
        const props = geo_fanout.inputSchema?.properties || {};
        assert.ok(props.url, 'geo_fanout exposes a url parameter');
        assert.ok(props.countries, 'geo_fanout exposes a countries parameter');
    } finally {
        // Always close the client, even when an assertion fails.
        await client.close();
    }
});

0 commit comments

Comments
 (0)