From 747e827fefb45a8a320edca56faf587eabac6baa Mon Sep 17 00:00:00 2001 From: vdbhb59 Date: Thu, 1 May 2025 14:16:17 +0530 Subject: [PATCH] Synced fix(google): Use updated Google search endpoint via unixfox's research for SearXNG (https://github.com/Ahwxorg/LibreY/blob/b531406ce3657224803dc55b1b05e7ce87d73813/engines/text/google.php) Add initial Bing support Note, Bing obfuscates anchor links but it was trivial to determine. (https://github.com/Ahwxorg/LibreY/commit/f9f51c6b74934f9b721351dcf89533213ffb2ec5) Update bing.php, decode URL Added urldecode to base64uri-decoded URL to enable proper parsing by get_base_url. (https://github.com/Ahwxorg/LibreY/commit/97c085cf951f82981575dd8858476c7c0c2e4bc6) Ignore Bing's relative links Updated parser for Bing links to ignore links that don't fit "results", such as relative links. Only non-obfuscated links and de-obfuscated absolute links will correctly make it through the parser. (https://github.com/Ahwxorg/LibreY/commit/7f12ad2950aeb58e2fc7a7ed136079022f409ddd) https://github.com/Ahwxorg/LibreY/pull/218 https://github.com/Ahwxorg/LibreY/pull/215 --- engines/text/bing.php | 92 +++++++++++++++++++++++++++++++++++++++++ engines/text/google.php | 31 +++++++++++--- engines/text/text.php | 9 +++- 3 files changed, 125 insertions(+), 7 deletions(-) create mode 100644 engines/text/bing.php diff --git a/engines/text/bing.php b/engines/text/bing.php new file mode 100644 index 0000000..0d3a11c --- /dev/null +++ b/engines/text/bing.php @@ -0,0 +1,92 @@ +query)); + + $results_language = $this->opts->language; + $number_of_results = $this->opts->number_of_results; + + // TODO figure out how to not autocorrect + $url = "https://www.bing.com/search?q=$query_encoded&first=" . ((10 * $this->page) + 1); + + // TODO language setting + if (!is_null($results_language)) + $url .= "&lang=$results_language"; + + return $url; + } + + public function parse_results($response) { + $results = array(); + $xpath = get_xpath($response); + + if (!$xpath) + return $results; + + foreach($xpath->query("//ol[@id='b_results']//li") as $result) { + $href_url = $xpath->evaluate(".//h2//a//@href", $result)[0]; + + if ($href_url == null) + continue; + + $possible_url = $href_url->textContent; + + $possible_url_query = parse_url($possible_url, PHP_URL_QUERY); + + if ($possible_url_query == false) + continue; + + parse_str($possible_url_query, $possible_url); + + if (!array_key_exists('u', $possible_url)) + continue; + + $possible_url = $possible_url['u']; + + if (str_starts_with($possible_url, "a1aHR0c")) + { + // First two characters are irrelevant, strip for later + $possible_url = substr($possible_url, 2); + } + if (str_starts_with($possible_url, "aHR0c")) + { + // Base64 "coded", extract and decode + $possible_url = str_replace('-', '+', $possible_url); + $possible_url = str_replace('_', '/', $possible_url); + $url = urldecode(base64_decode($possible_url, true)); + } else + $url = $possible_url; + + if (str_starts_with($url, "a1")) + continue; // It's probably a Bing-relative link such as for video, skip it. + + if (!empty($results) && array_key_exists("url", $results) && end($results)["url"] == $url->textContent) + continue; + + $title = $xpath->evaluate(".//h2//a", $result)[0]; + + if ($title == null) + continue; + + $title = $title->textContent; + + $description = ($xpath->evaluate(".//div[contains(@class, 'b_caption')]//p", $result)[0] ?? null) ?->textContent ?? ''; + + array_push($results, + array ( + "title" => htmlspecialchars($title), + "url" => htmlspecialchars($url), + // base_url is to be removed in the future, see #47 + "base_url" => htmlspecialchars(get_base_url($url)), + "description" => $description == null ? + TEXTS["result_no_description"] : + htmlspecialchars($description) + ) + ); + + } + return $results; + } + + } +?> \ No newline at end of file diff --git a/engines/text/google.php b/engines/text/google.php index 7a0d589..dd8bf4b 100644 --- a/engines/text/google.php +++ b/engines/text/google.php @@ -1,6 +1,24 @@ arc_id = "srp_"; + + for ($i = 0; $i < 24; $i++) { + $c = random_int(0, strlen($charset) - 1); + $this->arc_id .= $charset[$c]; + } + + $this->arc_id .= "_1"; + $this->arc_timestamp = time(); + } + public function get_request_url() { + if ($this->arc_timestamp + 3600 < time()) + $this->generate_arc_id(); $query_encoded = str_replace("%22", "\"", urlencode($this->query)); $results = array(); @@ -8,6 +26,7 @@ $domain = $this->opts->google_domain; $results_language = $this->opts->language; $number_of_results = $this->opts->number_of_results; + $arc_page = sprintf("%02d", $this->page * 10); $url = "https://www.google.$domain/search?q=$query_encoded&nfpr=1&start=$this->page"; @@ -22,6 +41,8 @@ if (isset($_COOKIE["safe_search"])) $url .= "&safe=medium"; + $url .= "&asearch=arc&async=arc_id:$this->arc_id$arc_page,use_ac:true,_fmt:html"; + return $url; } @@ -33,21 +54,21 @@ if (!$xpath) return $results; - $didyoumean = $xpath->query(".//a[@class='gL9Hy']")[0]; + $didyoumean = $xpath->query(".//p[@class='QRYxYe NNMgCf']/a/b/i")[0]; if (!is_null($didyoumean)) array_push($results, array( "did_you_mean" => $didyoumean->textContent )); - foreach($xpath->query("//div[@id='search']//div[contains(@class, 'g')]") as $result) { - $url = $xpath->evaluate(".//div[@class='yuRUbf']//a/@href", $result)[0]; + foreach($xpath->query("//div[@class='MjjYud']") as $result) { + $url = $xpath->evaluate(".//a[@class='zReHs']/@href", $result)[0]; if ($url == null) continue; if (!empty($results) && array_key_exists("url", end($results)) && end($results)["url"] == $url->textContent) - continue; + continue; $url = $url->textContent; @@ -76,4 +97,4 @@ return $results; } } -?> +?> \ No newline at end of file diff --git a/engines/text/text.php b/engines/text/text.php index bf824ac..6eb28af 100644 --- a/engines/text/text.php +++ b/engines/text/text.php @@ -1,6 +1,6 @@ +?> \ No newline at end of file