fix(google): Use updated Google search endpoint via unixfox's research for SearXNG
(b531406ce3/engines/text/google.php)

Add initial Bing support
Note: Bing obfuscates its result anchor links, but the encoding was trivial to work out.
(f9f51c6b74)

Update bing.php, decode URL
Added urldecode to the base64url-decoded URL so that get_base_url can parse it properly.
(97c085cf95)

Ignore Bing's relative links
Updated parser for Bing links to ignore links that don't fit "results", such as relative links. Only non-obfuscated links and de-obfuscated absolute links will correctly make it through the parser.
(7f12ad2950)

https://github.com/Ahwxorg/LibreY/pull/218
https://github.com/Ahwxorg/LibreY/pull/215
This commit is contained in:
vdbhb59 2025-05-01 14:16:17 +05:30
commit 747e827fef
3 changed files with 125 additions and 7 deletions

92
engines/text/bing.php Normal file
View file

@ -0,0 +1,92 @@
<?php
/**
 * Text-search engine request that scrapes Bing's HTML results page.
 */
class BingSearchRequest extends EngineRequest {
    /**
     * Build the Bing search URL for the current query, page and language.
     *
     * @return string Absolute URL to request.
     */
    public function get_request_url() {
        // urlencode() turns double quotes into %22; put them back literally.
        $query_encoded = str_replace("%22", "\"", urlencode($this->query));
        $results_language = $this->opts->language;

        // TODO figure out how to not autocorrect
        // "first" is computed as 10 * page + 1, i.e. ten results per page.
        $url = "https://www.bing.com/search?q=$query_encoded&first=" . ((10 * $this->page) + 1);

        // TODO language setting
        if (!is_null($results_language))
            $url .= "&lang=$results_language";

        return $url;
    }

    /**
     * Extract the result list from a Bing results page.
     *
     * @param mixed $response Raw response handed to get_xpath().
     * @return array List of results, each with "title", "url", "base_url"
     *               and "description" keys (all HTML-escaped).
     */
    public function parse_results($response) {
        $results = array();
        $xpath = get_xpath($response);

        if (!$xpath)
            return $results;

        foreach ($xpath->query("//ol[@id='b_results']//li") as $result) {
            $href_node = $xpath->evaluate(".//h2//a//@href", $result)[0];
            if ($href_node == null)
                continue;

            $url = $this->decode_result_url($href_node->textContent);
            if ($url === null)
                continue;

            // Skip a result that repeats the URL of the one just added.
            // Stored URLs are HTML-escaped, so compare against the escaped form.
            $escaped_url = htmlspecialchars($url);
            $previous = end($results);
            if ($previous !== false
                && array_key_exists("url", $previous)
                && $previous["url"] === $escaped_url)
                continue;

            $title_node = $xpath->evaluate(".//h2//a", $result)[0];
            if ($title_node == null)
                continue;

            $title = $title_node->textContent;
            $description = ($xpath->evaluate(".//div[contains(@class, 'b_caption')]//p", $result)[0] ?? null)?->textContent ?? '';

            array_push($results,
                array (
                    "title" => htmlspecialchars($title),
                    "url" => $escaped_url,
                    // base_url is to be removed in the future, see #47
                    "base_url" => htmlspecialchars(get_base_url($url)),
                    "description" => $description === '' ?
                        TEXTS["result_no_description"] :
                        htmlspecialchars($description)
                )
            );
        }

        return $results;
    }

    /**
     * Recover the real target URL from a Bing result anchor.
     *
     * The target is read from the "u" query parameter of the anchor. When it
     * carries the "a1" prefix followed by URL-safe base64 (of a string
     * starting with "http"), it is decoded; otherwise it is used as-is.
     *
     * @param string $href The anchor's href attribute value.
     * @return string|null The target URL, or null when the anchor should be
     *                     skipped (no usable "u" parameter, undecodable
     *                     payload, or a Bing-relative link).
     */
    private function decode_result_url($href) {
        $query = parse_url($href, PHP_URL_QUERY);
        if ($query == false)
            return null;

        parse_str($query, $params);
        // "u[]=..." would yield an array here; only a plain string is usable.
        if (!array_key_exists('u', $params) || !is_string($params['u']))
            return null;

        $target = $params['u'];

        // First two characters are irrelevant, strip them before decoding.
        if (str_starts_with($target, "a1aHR0c"))
            $target = substr($target, 2);

        if (str_starts_with($target, "aHR0c")) {
            // URL-safe base64: map "-" and "_" back to the standard alphabet.
            $decoded = base64_decode(strtr($target, '-_', '+/'), true);
            if ($decoded === false || $decoded === '')
                return null;

            // urldecode so that get_base_url can parse the result properly.
            $target = urldecode($decoded);
        }

        // Still "a1"-prefixed: probably a Bing-relative link such as for video.
        if ($target === '' || str_starts_with($target, "a1"))
            return null;

        return $target;
    }
}
?>

View file

@ -1,6 +1,24 @@
<?php <?php
class GoogleRequest extends EngineRequest { class GoogleRequest extends EngineRequest {
protected string $arc_id;
protected int $arc_timestamp = 0;
/**
 * Generate a new random arc_id ("srp_" + 24 random characters + "_1") and
 * record the time at which it was created.
 */
private function generate_arc_id() {
    // Digits, lower- and upper-case letters, "_" and "-", each listed once so
    // every character is equally likely.
    $charset = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-";
    $last_index = strlen($charset) - 1;

    $arc_id = "srp_";
    for ($i = 0; $i < 24; $i++)
        $arc_id .= $charset[random_int(0, $last_index)];

    $this->arc_id = $arc_id . "_1";
    $this->arc_timestamp = time();
}
public function get_request_url() { public function get_request_url() {
if ($this->arc_timestamp + 3600 < time())
$this->generate_arc_id();
$query_encoded = str_replace("%22", "\"", urlencode($this->query)); $query_encoded = str_replace("%22", "\"", urlencode($this->query));
$results = array(); $results = array();
@ -8,6 +26,7 @@
$domain = $this->opts->google_domain; $domain = $this->opts->google_domain;
$results_language = $this->opts->language; $results_language = $this->opts->language;
$number_of_results = $this->opts->number_of_results; $number_of_results = $this->opts->number_of_results;
$arc_page = sprintf("%02d", $this->page * 10);
$url = "https://www.google.$domain/search?q=$query_encoded&nfpr=1&start=$this->page"; $url = "https://www.google.$domain/search?q=$query_encoded&nfpr=1&start=$this->page";
@ -22,6 +41,8 @@
if (isset($_COOKIE["safe_search"])) if (isset($_COOKIE["safe_search"]))
$url .= "&safe=medium"; $url .= "&safe=medium";
$url .= "&asearch=arc&async=arc_id:$this->arc_id$arc_page,use_ac:true,_fmt:html";
return $url; return $url;
} }
@ -33,21 +54,21 @@
if (!$xpath) if (!$xpath)
return $results; return $results;
$didyoumean = $xpath->query(".//a[@class='gL9Hy']")[0]; $didyoumean = $xpath->query(".//p[@class='QRYxYe NNMgCf']/a/b/i")[0];
if (!is_null($didyoumean)) if (!is_null($didyoumean))
array_push($results, array( array_push($results, array(
"did_you_mean" => $didyoumean->textContent "did_you_mean" => $didyoumean->textContent
)); ));
foreach($xpath->query("//div[@id='search']//div[contains(@class, 'g')]") as $result) { foreach($xpath->query("//div[@class='MjjYud']") as $result) {
$url = $xpath->evaluate(".//div[@class='yuRUbf']//a/@href", $result)[0]; $url = $xpath->evaluate(".//a[@class='zReHs']/@href", $result)[0];
if ($url == null) if ($url == null)
continue; continue;
if (!empty($results) && array_key_exists("url", end($results)) && end($results)["url"] == $url->textContent) if (!empty($results) && array_key_exists("url", end($results)) && end($results)["url"] == $url->textContent)
continue; continue;
$url = $url->textContent; $url = $url->textContent;
@ -76,4 +97,4 @@
return $results; return $results;
} }
} }
?> ?>

View file

@ -1,6 +1,6 @@
<?php <?php
function get_engines() { function get_engines() {
return array("google", "duckduckgo", "brave", "yandex", "ecosia", "mojeek"); return array("google", "duckduckgo", "brave", "yandex", "ecosia", "mojeek", "bing");
} }
class TextSearch extends EngineRequest { class TextSearch extends EngineRequest {
@ -88,6 +88,11 @@
return new MojeekSearchRequest($opts, $mh); return new MojeekSearchRequest($opts, $mh);
} }
if ($engine == "bing") {
require_once "engines/text/bing.php";
return new BingSearchRequest($opts, $mh);
}
// if an invalid engine is selected, don't give any results // if an invalid engine is selected, don't give any results
return null; return null;
} }
@ -217,4 +222,4 @@
} }
} }
?> ?>