diff --git a/lib/bingcache-todo-fix.php b/lib/bingcache-todo-fix.php index c52fbfd..e69de29 100644 --- a/lib/bingcache-todo-fix.php +++ b/lib/bingcache-todo-fix.php @@ -1,144 +0,0 @@ - - -new bingcache(); - -class bingcache{ - - public function __construct(){ - - if( - !isset($_GET["s"]) || - $this->validate_url($_GET["s"]) === false - ){ - - var_dump($this->validate_url($_GET["s"])); - $this->do404("Please provide a valid URL."); - } - - $url = $_GET["s"]; - - $curlproc = curl_init(); - - curl_setopt( - $curlproc, - CURLOPT_URL, - "https://www.bing.com/search?q=url%3A" . - urlencode($url) - ); - - curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding - curl_setopt( - $curlproc, - CURLOPT_HTTPHEADER, - ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0", - "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - "Accept-Language: en-US,en;q=0.5", - "Accept-Encoding: gzip", - "DNT: 1", - "Connection: keep-alive", - "Upgrade-Insecure-Requests: 1", - "Sec-Fetch-Dest: document", - "Sec-Fetch-Mode: navigate", - "Sec-Fetch-Site: none", - "Sec-Fetch-User: ?1"] - ); - - curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); - curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); - curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); - curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 5); - - $data = curl_exec($curlproc); - - if(curl_errno($curlproc)){ - - $this->do404("Failed to connect to bing servers. Please try again later."); - } - - curl_close($curlproc); - - preg_match( - '/
/', - $data, - $keys - ); - - print_r($keys); - - if(count($keys) === 0){ - - $this->do404("Bing has not archived this URL."); - } - - $keys = explode("|", $keys[1]); - $count = count($keys); - - //header("Location: https://cc.bingj.com/cache.aspx?d=" . $keys[$count - 2] . "&w=" . $keys[$count - 1]); - echo("Location: https://cc.bingj.com/cache.aspx?d=" . $keys[$count - 2] . "&w=" . $keys[$count - 1]); - } - - public function do404($text){ - - include "lib/frontend.php"; - $frontend = new frontend(); - - echo - $frontend->load( - "error.html", - [ - "title" => "Shit", - "text" => $text - ] - ); - - die(); - } - - public function validate_url($url){ - - $url_parts = parse_url($url); - - // check if required parts are there - if( - !isset($url_parts["scheme"]) || - !( - $url_parts["scheme"] == "http" || - $url_parts["scheme"] == "https" - ) || - !isset($url_parts["host"]) - ){ - return false; - } - - if( - // if its not an RFC-valid URL - !filter_var($url, FILTER_VALIDATE_URL) - ){ - return false; - } - - $ip = - str_replace( - ["[", "]"], // handle ipv6 - "", - $url_parts["host"] - ); - - // if its not an IP - if(!filter_var($ip, FILTER_VALIDATE_IP)){ - - // resolve domain's IP - $ip = gethostbyname($url_parts["host"] . "."); - } - - // check if its localhost - return filter_var( - $ip, - FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE - ); - } -} diff --git a/scraper/coccoc.php b/scraper/coccoc.php index 8baf371..5a045d1 100644 --- a/scraper/coccoc.php +++ b/scraper/coccoc.php @@ -164,6 +164,13 @@ class coccoc{ throw new Exception("Failed to decode JSON"); } + if( + isset($html["captcha"]) && + (int)$html["captcha"] === 1 + ){ + + throw new Exception("Coc Coc returned a Captcha"); + } if(!isset($html["search"]["search_results"])){ diff --git a/scraper/google.php b/scraper/google.php index 4742971..83c4d01 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -705,7 +705,7 @@ class google{ } - private function unshit_thumb($url){ + private function unshit_thumb($url, $get_bigger_res = false){ // https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQINE2vbnNLHXqoZr3RVsaEJFyOsj1_BiBnJch-e1nyz3oia7Aj5xVj // https://i.ytimg.com/vi/PZVIyA5ER3Y/mqdefault.jpg?sqp=-oaymwEFCJQBEFM&rs=AMzJL3nXeaCpdIar-ltNwl82Y82cIJfphA @@ -714,7 +714,7 @@ class google{ if( isset($parts["host"]) && preg_match( - '/tbn.*\.gstatic\.com/', + '/(?:encrypted-)?tbn.*\.gstatic\.com/', $parts["host"] ) ){ @@ -723,7 +723,26 @@ class google{ if(isset($params["q"])){ - return "https://" . $parts["host"] . "/images?q=" . $params["q"]; + if($get_bigger_res){ + + // this method doesnt always work, but does work for wiki thumbnails + return + "https://" . $parts["host"] . "/images?q=tbn:" . + $this->base64url_encode( + substr( + $this->base64url_decode( + explode( + ":", + $params["q"])[1] + ), + 0, + 29 + ) + ); + }else{ + + return "https://" . $parts["host"] . "/images?q=" . $params["q"]; + } } } @@ -1591,9 +1610,12 @@ class google{ if(count($img) !== 0){ $thumb = - $this->fuckhtml - ->getTextContent( - $img[0]["attributes"]["src"] + $this->unshit_thumb( + $this->fuckhtml + ->getTextContent( + $img[0]["attributes"]["src"] + ), + true ); } @@ -2976,6 +2998,20 @@ class google{ return $time; } + function base64url_decode($data){ + + $b64 = strtr($data, "-_", "+/"); + $pad = strlen($b64) % 4; + if ($pad) $b64 .= str_repeat("=", 4 - $pad); + + return base64_decode($b64); + } + + function base64url_encode($data){ + + return rtrim(strtr(base64_encode($data), "+/", "-_"), "="); + } + private function detect_sorry(){ $captcha_form =