Self-Host

To Self-host @ https://4g.flossboxin.org.in
This commit is contained in:
vdbhb59 2025-02-21 22:04:53 +05:30
commit c6e404d2af
132 changed files with 34951 additions and 0 deletions

1860
scraper/brave.php Normal file

File diff suppressed because it is too large Load diff

145
scraper/crowdview.php Normal file
View file

@ -0,0 +1,145 @@
<?php
class crowdview{
/**
 * Helper object of class `backend`, created in the constructor.
 * Declared explicitly so PHP 8.2+ does not report a dynamic-property
 * deprecation; kept public because dynamic properties were public.
 */
public $backend;

/** Helper object of class `fuckhtml`, created in the constructor. */
public $fuckhtml;

/**
 * Load the helper libraries and create the helper objects used by this scraper.
 *
 * include_once (instead of include) keeps a second scraper constructed in the
 * same request from re-including the files and redeclaring their classes.
 */
public function __construct(){
    include_once "lib/backend.php";
    $this->backend = new backend("crowdview");

    include_once "lib/fuckhtml.php";
    $this->fuckhtml = new fuckhtml();
}
/**
 * List the search filters this scraper offers.
 *
 * @param mixed $page Requested page type; not consulted.
 * @return array Always an empty array.
 */
public function getfilters($page){
    $filters = [];

    return $filters;
}
/**
 * Perform an HTTP GET request through curl and return the response body.
 *
 * @param mixed  $proxy Proxy value handed to $this->backend->assign_proxy().
 * @param string $url   Target URL without query string.
 * @param array  $get   Query parameters; appended to $url when not empty.
 * @return string|bool  Whatever curl_exec() returned for the transfer.
 * @throws Exception    With curl's error message when the transfer fails.
 */
private function get($proxy, $url, $get = []){
    if($get !== []){
        $url .= "?" . http_build_query($get);
    }

    $headers = [
        "User-Agent: " . config::USER_AGENT,
        "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language: en-US,en;q=0.5",
        "Accept-Encoding: gzip",
        "DNT: 1",
        "Connection: keep-alive",
        "Upgrade-Insecure-Requests: 1",
        "Sec-Fetch-Dest: document",
        "Sec-Fetch-Mode: navigate",
        "Sec-Fetch-Site: none",
        "Sec-Fetch-User: ?1"
    ];

    $curlproc = curl_init();
    curl_setopt($curlproc, CURLOPT_URL, $url);
    curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
    curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
    curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
    curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

    $this->backend->assign_proxy($curlproc, $proxy);

    $data = curl_exec($curlproc);

    if(curl_errno($curlproc)){
        // read the message first, then release the handle before throwing
        // (the handle used to stay open on this path)
        $message = curl_error($curlproc);
        curl_close($curlproc);
        throw new Exception($message);
    }

    curl_close($curlproc);
    return $data;
}
/**
 * Run a web search against the CrowdView JSON API.
 *
 * @param array $get Request parameters; only $get["s"] (search term) is read.
 * @return array Result structure with "status", "spelling", "npt" and the
 *               per-type result lists; only "web" is populated here.
 * @throws Exception When the search term is empty, the request fails or the
 *                   response is not valid JSON.
 */
public function web($get){
    $search = $get["s"];
    if(strlen($search) === 0){
        throw new Exception("Search term is empty!");
    }

    $proxy = $this->backend->get_ip();

    try{
        $json = $this->get(
            $proxy,
            "https://crowdview-next-js.onrender.com/api/search-v3",
            [
                "query" => $search
            ]
        );
    }catch(Exception $error){
        // same public message as before; the cause is chained for debugging
        throw new Exception("Failed to fetch JSON", 0, $error);
    }

    $json = json_decode($json, true);
    if($json === NULL){
        throw new Exception("Failed to decode JSON");
    }

    $out = [
        "status" => "ok",
        "spelling" => [
            "type" => "no_correction",
            "using" => null,
            "correction" => null
        ],
        "npt" => null,
        "answer" => [],
        "web" => [],
        "image" => [],
        "video" => [],
        "news" => [],
        "related" => []
    ];

    // a response without a "results" list is treated as "no results"
    $results = [];
    if(is_array($json) && isset($json["results"]) && is_array($json["results"])){
        $results = $json["results"];
    }

    foreach($results as $item){
        // the text before the first "<b>" is parsed as the date and the
        // rest is used as the description
        $snippet = isset($item["snippet"]) ? (string)$item["snippet"] : "";
        $parts = explode("<b>", $snippet, 2);

        if(count($parts) === 2){
            $date = strtotime($parts[0]);
            if($date === false){
                // unparsable date: report "unknown" instead of false
                $date = null;
            }
            $description = $parts[1];
        }else{
            // no "<b>" marker: no date part, whole snippet is the description
            $date = null;
            $description = $snippet;
        }

        $out["web"][] = [
            "title" => $this->sanitize($item["title"]),
            "description" => $this->sanitize($description),
            "url" => $item["link"],
            "date" => $date,
            "type" => "web",
            "thumb" => [
                "url" => null,
                "ratio" => null
            ],
            "sublink" => [],
            "table" => []
        ];
    }

    return $out;
}
/**
 * Turn an HTML fragment into trimmed plain text.
 *
 * Entities are decoded, the result is passed through
 * $this->fuckhtml->getTextContent(), and leading/trailing dots and spaces
 * are removed.
 *
 * @param string $html HTML fragment.
 * @return string Cleaned text.
 */
private function sanitize($html){
    $decoded = html_entity_decode($html);
    $text = $this->fuckhtml->getTextContent($decoded);

    return trim($text, ". ");
}
}

309
scraper/curlie.php Normal file
View file

@ -0,0 +1,309 @@
<?php
class curlie{
/**
 * Helper object of class `backend`, created in the constructor.
 * Declared explicitly so PHP 8.2+ does not report a dynamic-property
 * deprecation; kept public because dynamic properties were public.
 */
public $backend;

/** Helper object of class `fuckhtml`, created in the constructor. */
public $fuckhtml;

/**
 * Load the helper libraries and create the helper objects used by this scraper.
 *
 * include_once (instead of include) keeps a second scraper constructed in the
 * same request from re-including the files and redeclaring their classes.
 */
public function __construct(){
    include_once "lib/backend.php";
    $this->backend = new backend("curlie");

    include_once "lib/fuckhtml.php";
    $this->fuckhtml = new fuckhtml();
}
/**
 * List the search filters this scraper offers for a page type.
 *
 * @param mixed $page Page type; only "web" has filters.
 * @return array Empty for every page other than "web"; otherwise a single
 *               "lang" filter mapping language codes to display names.
 */
public function getfilters($page){
    $languages = [
        "any" => "Any language",
        "en" => "English",
        "de" => "German",
        "fr" => "French",
        "ja" => "Japanese",
        "it" => "Italian",
        "es" => "Spanish",
        "ru" => "Russian",
        "nl" => "Dutch",
        "pl" => "Polish",
        "tr" => "Turkish",
        "da" => "Danish",
        "sv" => "Swedish",
        "no" => "Norwegian",
        "is" => "Icelandic",
        "fo" => "Faroese",
        "fi" => "Finnish",
        "et" => "Estonian",
        "lt" => "Lithuanian",
        "lv" => "Latvian",
        "cy" => "Welsh",
        "ga" => "Irish",
        "gd" => "Scottish Gaelic",
        "br" => "Breton",
        "fy" => "Frisian",
        "frr" => "North Frisian",
        "gem" => "Saterland Frisian",
        "lb" => "Luxembourgish",
        "rm" => "Romansh",
        "pt" => "Portuguese",
        "ca" => "Catalan",
        "gl" => "Galician",
        "eu" => "Basque",
        "ast" => "Asturian",
        "an" => "Aragonese",
        "fur" => "Friulan",
        "sc" => "Sardinian",
        "scn" => "Sicilian",
        "oc" => "Occitan",
        "be" => "Belarusian",
        "cs" => "Czech",
        "hu" => "Hungarian",
        "sk" => "Slovak",
        "uk" => "Ukrainian",
        "csb" => "Kashubian",
        "tt" => "Tatar",
        "ba" => "Bashkir",
        "os" => "Ossetian",
        "sl" => "Slovene",
        "sr" => "Serbian",
        "hr" => "Croatian",
        "bs" => "Bosnian",
        "bg" => "Bulgarian",
        "sq" => "Albanian",
        "ro" => "Romanian",
        "mk" => "Macedonian",
        "el" => "Greek",
        "iw" => "Hebrew",
        "fa" => "Persian",
        "ar" => "Arabic",
        "ku" => "Kurdish",
        "az" => "Azerbaijani",
        "hy" => "Armenian",
        "af" => "Afrikaans",
        "sw" => "Kiswahili",
        "uz" => "Uzbek",
        "kk" => "Kazakh",
        "ky" => "Kyrgyz",
        "tg" => "Tajik",
        "tk" => "Turkmen",
        "ug" => "Uyghurche",
        "hi" => "Hindi",
        "si" => "Sinhalese",
        "gu" => "Gujarati",
        "ur" => "Urdu",
        "mr" => "Marathi",
        "pa" => "Punjabi",
        "bn" => "Bengali",
        "ta" => "Tamil",
        "te" => "Telugu",
        "kn" => "Kannada",
        "zh_CN" => "Chinese Simplified",
        "zh_TW" => "Chinese Traditional",
        "ko" => "Korean",
        "cfr" => "Taiwanese",
        "th" => "Thai",
        "vi" => "Vietnamese",
        "in" => "Indonesian",
        "ms" => "Malay",
        "tl" => "Tagalog",
        "eo" => "Esperanto",
        "ia" => "Interlingua",
        "la" => "Latin"
    ];

    if($page == "web"){
        return [
            "lang" => [
                "display" => "Language",
                "option" => $languages
            ]
        ];
    }

    return [];
}
/**
 * Perform an HTTP GET request through curl and return the response body.
 *
 * @param mixed  $proxy Proxy value handed to $this->backend->assign_proxy().
 * @param string $url   Target URL without query string.
 * @param array  $get   Query parameters; appended to $url when not empty.
 * @return string|bool  Whatever curl_exec() returned for the transfer.
 * @throws Exception    With curl's error message when the transfer fails.
 */
private function get($proxy, $url, $get = []){
    if($get !== []){
        $url .= "?" . http_build_query($get);
    }

    $headers = [
        "User-Agent: " . config::USER_AGENT,
        "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language: en-US,en;q=0.5",
        "Accept-Encoding: gzip",
        "DNT: 1",
        "Connection: keep-alive",
        "Upgrade-Insecure-Requests: 1",
        "Sec-Fetch-Dest: document",
        "Sec-Fetch-Mode: navigate",
        "Sec-Fetch-Site: none",
        "Sec-Fetch-User: ?1"
    ];

    $curlproc = curl_init();
    curl_setopt($curlproc, CURLOPT_URL, $url);
    curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
    curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
    curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
    curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

    $this->backend->assign_proxy($curlproc, $proxy);

    $data = curl_exec($curlproc);

    if(curl_errno($curlproc)){
        // read the message first, then release the handle before throwing
        // (the handle used to stay open on this path)
        $message = curl_error($curlproc);
        curl_close($curlproc);
        throw new Exception($message);
    }

    curl_close($curlproc);
    return $data;
}
/**
 * Run a web search on curlie.org and parse the result page.
 *
 * @param array $get Request parameters. Reads "npt" (next-page token) and,
 *                   for a first page, "s" (search term) and "lang".
 * @return array Result structure; "web" holds the parsed entries and "npt"
 *               a token for the following page (null when there is none).
 * @throws Exception When the search page cannot be fetched.
 */
public function web($get){
    if($get["npt"]){
        // continue a previous search: the stored value is the path of the
        // next page, together with the proxy used for the first request
        [$query, $proxy] = $this->backend->get($get["npt"], "web");

        $url = "https://curlie.org/" . $query;
        $params = [];
    }else{
        $proxy = $this->backend->get_ip();

        $url = "https://curlie.org/search";
        $params = [
            "q" => $get["s"],
            "start" => 0,
            "stime" => 92452189 // ? (meaning unknown, kept as-is)
        ];

        if($get["lang"] !== "any"){
            $params["lang"] = $get["lang"];
        }
    }

    try{
        $html = $this->get($proxy, $url, $params);
    }catch(Exception $error){
        // same public message as before; the cause is chained for debugging
        throw new Exception("Failed to fetch search page", 0, $error);
    }

    $this->fuckhtml->load($html);

    // next page link, if any
    $nextpage =
        $this->fuckhtml
        ->getElementsByClassName(
            "next-page",
            "a"
        );

    if(count($nextpage) !== 0){
        // NOTE(review): the href is stored raw, while result URLs below go
        // through getTextContent() -- confirm whether it needs decoding too
        $nextpage =
            $this->backend->store(
                $nextpage[0]["attributes"]["href"],
                "web",
                $proxy
            );
    }else{
        $nextpage = null;
    }

    $out = [
        "status" => "ok",
        "spelling" => [
            "type" => "no_correction",
            "using" => null,
            "correction" => null
        ],
        "npt" => $nextpage,
        "answer" => [],
        "web" => [],
        "image" => [],
        "video" => [],
        "news" => [],
        "related" => []
    ];

    $items =
        $this->fuckhtml
        ->getElementsByClassName(
            "site-item",
            "div"
        );

    foreach($items as $item){
        $this->fuckhtml->load($item);

        $anchors =
            $this->fuckhtml
            ->getElementsByAttributeValue(
                "target",
                "_blank",
                "a"
            );

        // an entry without an outbound link cannot become a result; skip it
        // instead of emitting a row built from undefined values
        if(!isset($anchors[0]["attributes"]["href"])){
            continue;
        }
        $a = $anchors[0];

        $description =
            $this->fuckhtml
            ->getElementsByClassName("site-descr");

        if(count($description) !== 0){
            $description =
                $this->fuckhtml
                ->getTextContent(
                    $description[0]
                );
        }else{
            $description = null;
        }

        $out["web"][] = [
            "title" => $this->fuckhtml->getTextContent($a),
            "description" => $description,
            "url" => $this->fuckhtml->getTextContent($a["attributes"]["href"]),
            "date" => null,
            "type" => "web",
            "thumb" => [
                "url" => null,
                "ratio" => null
            ],
            "sublink" => [],
            "table" => []
        ];
    }

    return $out;
}
}

1967
scraper/ddg.php Normal file

File diff suppressed because it is too large Load diff

820
scraper/facebook.php Normal file
View file

@ -0,0 +1,820 @@
<?php
class facebook{
const get = 0;
const post = 1;
/**
 * Helper object of class `nextpage`, created in the constructor.
 * The properties below are declared explicitly so PHP 8.2+ does not report
 * dynamic-property deprecations; they stay public because dynamic
 * properties were public.
 */
public $nextpage;

/** Helper object of class `proxy_pool`, created in the constructor. */
public $proxy;

/** Result structure filled while a video search is running. */
public $out;

/**
 * Load the helper libraries and create the helper objects used by this scraper.
 *
 * include_once (instead of include) keeps a second scraper constructed in the
 * same request from re-including the files and redeclaring their classes.
 */
public function __construct(){
    include_once "lib/nextpage.php";
    $this->nextpage = new nextpage("fb");

    include_once "lib/proxy_pool.php";
    $this->proxy = new proxy_pool("facebook");
}
/**
 * List the search filters this scraper offers.
 *
 * @param mixed $page Page type; not consulted, the same filters are
 *                    returned for every page.
 * @return array Filters "sort", "newer", "older" and "live".
 */
public function getfilters($page){
    $filters = [];

    $filters["sort"] = [
        "display" => "Sort by",
        "option" => [
            "relevance" => "Relevance",
            "most_recent" => "Most recent"
        ]
    ];

    // "_DATE" is passed through as the option value for both date filters
    $filters["newer"] = [
        "display" => "Newer than",
        "option" => "_DATE"
    ];
    $filters["older"] = [
        "display" => "Older than",
        "option" => "_DATE"
    ];

    $filters["live"] = [
        "display" => "Livestream",
        "option" => [
            "no" => "No",
            "yes" => "Yes"
        ]
    ];

    return $filters;
}
/**
 * Perform an HTTP request through curl and return the response body.
 *
 * For self::get the parameters are appended to the URL as a query string;
 * for any other $reqtype they are sent as an urlencoded POST body over
 * HTTP/2 with the GraphQL request headers.
 *
 * @param string $url     Target URL.
 * @param array  $get     Request parameters (query string or form body).
 * @param int    $reqtype self::get or self::post.
 * @return string|bool    Whatever curl_exec() returned for the transfer.
 * @throws Exception      With curl's error message when the transfer fails.
 */
private function get($url, $get = [], $reqtype = self::get){
    $curlproc = curl_init();

    // null means "no parameters given"
    $query = null;
    if($get !== []){
        $query = http_build_query($get);
    }

    // headers are chosen from the request type alone: they used to be set
    // only when $get was non-empty, leaving $headers undefined otherwise
    if($reqtype === self::get){
        $headers = [
            "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0",
            "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language: en-US,en;q=0.5",
            "Accept-Encoding: gzip",
            "DNT: 1",
            "Connection: keep-alive",
            "Upgrade-Insecure-Requests: 1",
            "Sec-Fetch-Dest: document",
            "Sec-Fetch-Mode: navigate",
            "Sec-Fetch-Site: none",
            "Sec-Fetch-User: ?1"
        ];

        if($query !== null){
            $url .= "?" . $query;
        }
    }else{
        $body = (string)$query;

        curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);

        $headers = [
            "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0",
            "Accept: */*",
            "Accept-Language: en-US,en;q=0.5",
            "Accept-Encoding: gzip, deflate, br",
            "Content-Type: application/x-www-form-urlencoded",
            "X-FB-Friendly-Name: SearchCometResultsPaginatedResultsQuery",
            //"X-FB-LSD: AVptQC4a16c",
            //"X-ASBD-ID: 129477",
            "Content-Length: " . strlen($body),
            "Origin: https://www.facebook.com",
            "DNT: 1",
            "Connection: keep-alive",
            "Referer: https://www.facebook.com/watch/",
            "Cookie: datr=__GMZCgwVF5BbyvAtfJojQwg; oo=v1%7C3%3A1691641171; wd=955x995",
            "Sec-Fetch-Dest: empty",
            "Sec-Fetch-Mode: cors",
            "Sec-Fetch-Site: same-origin",
            "TE: trailers"
        ];

        curl_setopt($curlproc, CURLOPT_POST, true);
        curl_setopt($curlproc, CURLOPT_POSTFIELDS, $body);
    }

    curl_setopt($curlproc, CURLOPT_URL, $url);
    curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
    curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
    curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
    curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

    $this->proxy->assign_proxy($curlproc);

    $data = curl_exec($curlproc);

    if(curl_errno($curlproc)){
        // read the message first, then release the handle before throwing
        $message = curl_error($curlproc);
        curl_close($curlproc);
        throw new Exception($message);
    }

    curl_close($curlproc);
    return $data;
}
/**
 * Search Facebook Watch for videos.
 *
 * With a next-page token ($get["npt"]) the stored GraphQL request is
 * replayed. Otherwise the search page is parsed, its results are collected,
 * and when more results exist a GraphQL pagination request is built from
 * values found in the page and executed right away.
 *
 * NOTE(review): the live request for the search page is commented out and
 * the page is read from the local file scraper/facebook.html. That is kept
 * unchanged; confirm whether the live request should be restored.
 *
 * @param array $get Request parameters: "s", "npt", "sort", "live",
 *                   "older", "newer".
 * @return array The result structure held in $this->out.
 * @throws Exception When the page cannot be read or the expected data
 *                   cannot be extracted from it.
 */
public function video($get){
    $search = $get["s"];
    $npt = $get["npt"];

    $this->out = [
        "status" => "ok",
        "npt" => null,
        "video" => [],
        "author" => [],
        "livestream" => [],
        "playlist" => [],
        "reel" => []
    ];

    if($npt){
        $nextpage =
            json_decode(
                $this->nextpage->get(
                    $npt,
                    "videos"
                ),
                true
            );

        // parse next page
        $this->video_nextpage($nextpage);
        return $this->out;
    }

    // generate filter data: a map of "<filter>:0" keys to JSON-encoded
    // {"name": ..., "args": ...} objects ("args" is itself a JSON string
    // for the creation_time filter)
    $filter = [];
    $sort = $get["sort"];
    $live = $get["live"];
    $older = $get["older"];
    $newer = $get["newer"];

    if(
        $older !== false ||
        $newer !== false
    ){
        if($older === false){
            $older = time();
        }
        if($newer === false){
            $newer = 0;
        }

        $filter["rp_creation_time:0"] =
            json_encode(
                [
                    "name" => "creation_time",
                    "args" =>
                        json_encode(
                            [
                                "start_year" => date("Y", $newer),
                                "start_month" => date("Y-m", $newer),
                                "end_year" => date("Y", $older),
                                "end_month" => date("Y-m", $older),
                                "start_day" => date("Y-m-d", $newer),
                                "end_day" => date("Y-m-d", $older)
                            ]
                        )
                ]
            );
    }

    if($sort != "relevance"){
        $filter["videos_sort_by:0"] =
            json_encode(
                [
                    "name" => "videos_sort_by",
                    "args" => "Most Recent"
                ]
            );
    }

    if($live != "no"){
        $filter["videos_live:0"] =
            json_encode(
                [
                    "name" => "videos_live",
                    "args" => ""
                ]
            );
    }

    // request parameters for the (currently disabled) live request
    $req = [
        "q" => $search
    ];

    if(count($filter) !== 0){
        $req["filters"] =
            base64_encode(
                json_encode(
                    $filter
                )
            );
    }

    /*
    $html =
        $this->get(
            "https://www.facebook.com/watch/search/",
            $req
        );*/

    // read the saved page; fail with an exception instead of passing a
    // false handle on to fread() when the file is missing
    $fixture = "scraper/facebook.html";
    $html = is_readable($fixture) ? file_get_contents($fixture) : false;
    if($html === false){
        throw new Exception("Failed to fetch search page");
    }

    preg_match_all(
        '/({"__bbox":.*,"sequence_number":0}})\]\]/',
        $html,
        $json
    );

    // the second match is the one that is decoded below
    if(!isset($json[1][1])){
        throw new Exception("Could not grep JSON body");
    }

    $json = json_decode($json[1][1], true);
    if($json === null){
        throw new Exception("Failed to decode JSON body");
    }

    $results = $json["__bbox"]["result"]["data"]["serpResponse"]["results"] ?? [];
    $edges = $results["edges"] ?? [];
    if(!is_array($edges)){
        $edges = [];
    }

    foreach($edges as $result){
        $this->parse_edge($result);
    }

    // get nextpage data
    if(($results["page_info"]["has_next_page"] ?? false) == 1){

        $matched =
            preg_match(
                '/handleWithCustomApplyEach\(ScheduledApplyEach,({.*})\);}\);}\);<\/script>/',
                $html,
                $nextpagedata
            );

        if($matched !== 1){
            throw new Exception("Could not grep the next page data");
        }

        $nextpagedata = json_decode($nextpagedata[1], true);
        if(!is_array($nextpagedata)){
            throw new Exception("Failed to decode the next page data");
        }

        // [POST] https://www.facebook.com/api/graphql/
        // FORM data, not JSON!
        // The key order is kept as-is: it is the order of the form fields.
        $nextpage = [
            "av" => "0",
            "__user" => null,
            "__a" => null,
            "__req" => "2",
            "__hs" => null,
            "dpr" => "1",
            "__ccg" => null,
            "__rev" => null,
            // another client side token
            "__s" => $this->randomstring(6) . ":" . $this->randomstring(6) . ":" . $this->randomstring(6),
            "__hsi" => null,
            // tracking fingerprint (probably generated using webgl)
            "__dyn" => "7xeUmwlE7ibwKBWo2vwAxu13w8CewSwMwNw9G2S0im3y4o0B-q1ew65xO2O1Vw8G1Qw5Mx61vw9m1YwBgao6C0Mo5W3S7Udo5q4U2zxe2Gew9O222SUbEaU2eU5O0GpovU19pobodEGdw46wbS1LwTwNwLw8O1pwr86C16w",
            "__csr" => $this->randomstring(null),
            "__comet_req" => null,
            "lsd" => null,
            "jazoest" => null,
            "__spin_r" => null,
            "__spin_b" => null,
            "__spin_t" => null,
            "fb_api_caller_class" => "RelayModern",
            "fb_api_req_friendly_name" => "SearchCometResultsPaginatedResultsQuery",
            "variables" => [ // this is json
                "UFI2CommentsProvider_commentsKey" => "SearchCometResultsInitialResultsQuery",
                "allow_streaming" => false,
                "args" => [
                    "callsite" => "comet:watch_search",
                    "config" => [
                        "exact_match" => false,
                        "high_confidence_config" => null,
                        "intercept_config" => null,
                        "sts_disambiguation" => null,
                        "watch_config" => null
                    ],
                    "context" => [
                        "bsid" => null,
                        "tsid" => null
                    ],
                    "experience" => [
                        "encoded_server_defined_params" => null,
                        "fbid" => null,
                        "type" => "WATCH_TAB_GLOBAL"
                    ],
                    // the encoded filters built above, as a plain list
                    "filters" => array_values($filter),
                    "text" => $search
                ],
                "count" => 5,
                "cursor" => $results["page_info"]["end_cursor"] ?? null,
                "displayCommentsContextEnableComment" => false,
                "displayCommentsContextIsAdPreview" => false,
                "displayCommentsContextIsAggregatedShare" => false,
                "displayCommentsContextIsStorySet" => false,
                "displayCommentsFeedbackContext" => null,
                "feedLocation" => "SEARCH",
                "feedbackSource" => 23,
                "fetch_filters" => true,
                "focusCommentID" => null,
                "locale" => null,
                "privacySelectorRenderLocation" => "COMET_STREAM",
                "renderLocation" => "search_results_page",
                "scale" => 1,
                "stream_initial_count" => 0,
                "useDefaultActor" => false,
                "__relay_internal__pv__IsWorkUserrelayprovider" => false,
                "__relay_internal__pv__IsMergQAPollsrelayprovider" => false,
                "__relay_internal__pv__StoriesArmadilloReplyEnabledrelayprovider" => false,
                "__relay_internal__pv__StoriesRingrelayprovider" => false
            ],
            "server_timestamps" => "true",
            "doc_id" => "6761275837251607" // is actually dynamic
        ];

        // get bsid: walk four levels into "require" and take the bsid of
        // every entry that carries one (the last one found wins)
        foreach(($nextpagedata["require"] ?? []) as $key){
            if(!is_array($key)){
                continue;
            }
            foreach($key as $innerkey){
                if(!is_array($innerkey)){
                    continue;
                }
                foreach($innerkey as $inner_innerkey){
                    if(!is_array($inner_innerkey)){
                        continue;
                    }
                    foreach($inner_innerkey as $candidate){
                        if(isset($candidate["variables"]["args"]["context"]["bsid"])){
                            $nextpage["variables"]["args"]["context"]["bsid"] =
                                $candidate["variables"]["args"]["context"]["bsid"];
                        }
                    }
                }
            }
        }

        // copy session values out of the "define" entries
        foreach(($nextpagedata["define"] ?? []) as $key){
            if(!isset($key[2]) || !is_array($key[2])){
                continue;
            }
            $definition = $key[2];

            if(isset($definition["haste_session"])){
                $nextpage["__hs"] = $definition["haste_session"];
            }
            if(isset($definition["connectionClass"])){
                $nextpage["__ccg"] = $definition["connectionClass"];
            }
            if(isset($definition["hsi"])){
                $nextpage["__hsi"] = (string)$definition["hsi"];
            }
            if(!empty($definition["token"])){
                $nextpage["lsd"] = $definition["token"];
            }
            if(isset($definition["__spin_r"])){
                // __rev carries the same value as __spin_r
                $nextpage["__spin_r"] = (string)$definition["__spin_r"];
                $nextpage["__rev"] = $nextpage["__spin_r"];
            }
            if(isset($definition["__spin_b"])){
                $nextpage["__spin_b"] = $definition["__spin_b"];
            }
            if(isset($definition["__spin_t"])){
                $nextpage["__spin_t"] = (string)$definition["__spin_t"];
            }
        }

        preg_match(
            '/{"u":"\\\\\/ajax\\\\\/qm\\\\\/\?__a=([0-9]+)&__user=([0-9]+)&__comet_req=([0-9]+)&jazoest=([0-9]+)"/',
            $html,
            $ajaxparams
        );

        if(count($ajaxparams) !== 5){
            throw new Exception("Could not grep the AJAX parameters");
        }

        $nextpage["__a"] = $ajaxparams[1];
        $nextpage["__user"] = $ajaxparams[2];
        $nextpage["__comet_req"] = $ajaxparams[3];
        $nextpage["jazoest"] = $ajaxparams[4];

        /*
        $handle = fopen("scraper/facebook-nextpage.json", "r");
        $json = fread($handle, filesize("scraper/facebook-nextpage.json"));
        fclose($handle);*/

        // "variables" travels as a JSON string inside the form data
        $nextpage["variables"] = json_encode($nextpage["variables"]);

        $this->video_nextpage($nextpage);
    }

    return $this->out;
}
/**
 * Execute a GraphQL pagination request and collect its results.
 *
 * Every returned edge is handed to parse_edge(). When the response reports
 * a further page, the request is updated with the new cursor and stored,
 * and the resulting token is written to $this->out["npt"].
 *
 * @param array $nextpage  Form fields of the GraphQL request; "variables"
 *                         is a JSON-encoded string.
 * @param bool  $getcursor Unused; kept for interface compatibility.
 * @throws Exception When the response is not valid JSON or the request fails.
 */
private function video_nextpage($nextpage, $getcursor = false){
    $response =
        $this->get(
            "https://www.facebook.com/api/graphql/",
            $nextpage,
            self::post
        );

    $json = json_decode($response, true);
    if($json === null){
        throw new Exception("Failed to decode next page JSON");
    }

    // a response without the expected structure is treated as "no results"
    // instead of being indexed blindly
    $results = $json["data"]["serpResponse"]["results"] ?? [];
    $edges = $results["edges"] ?? [];
    if(!is_array($edges)){
        $edges = [];
    }

    foreach($edges as $result){
        $this->parse_edge($result);
    }

    if(($results["page_info"]["has_next_page"] ?? false) == 1){
        // swap the cursor inside the JSON-encoded variables
        $variables = json_decode($nextpage["variables"], true);
        $variables["cursor"] = $results["page_info"]["end_cursor"] ?? null;
        $nextpage["variables"] = json_encode($variables);

        //change this for second call. after, it's static.
        // TODO: csr also updates to longer string
        $nextpage["__dyn"] = "7xeUmwlEnwn8K2WnFw9-2i5U4e0yoW3q322aew9G2S0zU20xi3y4o0B-q1ew65xOfxO1Vw8G11xmfz81s8hwGwQw9m1YwBgao6C2O0B85W3S7Udo5qfK0EUjwGzE2swwwJK2W2K0zK5o4q0GpovU19pobodEGdw46wbS1LwTwNwLw8O1pwr86C16w";

        // TODO: change this on third and 6th call
        //$nextpage["__s"] = $this->randomstring(6) . ":" . explode(":", $nextpage["__s"], 2)[1];

        $this->out["npt"] = $this->nextpage->store(json_encode($nextpage), "videos");
    }
}
/**
 * Convert one search result edge into an entry of $this->out.
 *
 * The entry is appended to $this->out["livestream"] when the broadcast
 * status is "live" and to $this->out["video"] otherwise.
 *
 * @param array $edge One element of the "edges" list of a search response.
 */
private function parse_edge($edge){
    $model = $edge["relay_rendering_strategy"]["view_model"];
    $meta = $model["video_metadata_model"];
    $thumb = $model["video_thumbnail_model"];

    $status = $meta["video_broadcast_status"];
    $bucket = "video";

    if(strtolower($status) == "live"){
        // handle livestream
        $bucket = "livestream";
        $duration = "_LIVE";
        $timetext = null;
        $views = (int)$meta["relative_time_string"];
        $url_prefix = "https://www.facebook.com/watch/live/?v=";

    }elseif(stripos($status, "vod") !== false){
        // handle VOD format
        $timetext = null;
        $views = (int)$meta["relative_time_string"];
        $duration = $this->hms2int($thumb["video_duration_text"]);
        $url_prefix = "https://www.facebook.com/watch/live/?v=";

    }else{
        // handle normal format: "<relative time> · <view count>"
        $pieces = explode(" · ", $meta["relative_time_string"], 2);

        $views = null;
        if(count($pieces) === 2){
            $views = $this->truncatedcount2int($pieces[1]);
        }

        $timetext = strtotime($pieces[0]);
        $duration = $this->hms2int($thumb["video_duration_text"]);
        $url_prefix = "https://www.facebook.com/watch/?v=";
    }

    // prefer the watch URL built from the owner's uri_token
    if(isset($meta["video_owner_profile"]["uri_token"])){
        $profileurl =
            "https://www.facebook.com/watch/" .
            $meta["video_owner_profile"]["uri_token"];
    }else{
        $profileurl = $meta["video_owner_profile"]["url"];
    }

    // single-line title, cut to 100 characters
    $title =
        $this->limitstrlen(
            str_replace("\n", " ", $meta["title"]),
            100
        );

    if(empty($meta["save_description"])){
        $description = null;
    }else{
        $description =
            str_replace(
                "\n",
                " ",
                $this->limitstrlen($meta["save_description"])
            );
    }

    $this->out[$bucket][] = [
        "title" => $title,
        "description" => $description,
        "author" => [
            "name" => $meta["video_owner_profile"]["name"],
            "url" => $profileurl,
            "avatar" => null
        ],
        "date" => $timetext,
        "duration" => $duration,
        "views" => $views,
        "thumb" => [
            "url" => $thumb["thumbnail_image"]["uri"],
            "ratio" => "16:9"
        ],
        "url" =>
            $url_prefix .
            $model["video_click_model"]["click_metadata_model"]["video_id"]
    ];
}
/**
 * Build a random token.
 *
 * @param int|null $len Number of characters. null selects the larger
 *                      alphabet (letters of both cases, digits 1-9, "-")
 *                      and a random length between 141 and 145; an integer
 *                      uses lowercase letters and digits 1-9.
 * @return string|null The token, or null when no character was generated.
 */
private function randomstring($len){
    $long = ($len === null);

    $alphabet =
        $long ?
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ123456789-" :
        "abcdefghijklmnopqrstuvwxyz123456789";

    if($long){
        $len = rand(141, 145);
    }

    // highest valid index into the alphabet
    $last = strlen($alphabet) - 1;

    $token = null;
    $remaining = $len;
    while($remaining > 0){
        $token .= $alphabet[rand(0, $last)];
        $remaining--;
    }

    return $token;
}
/**
 * Keep only the first line of the text after word-wrapping it.
 *
 * The text is wrapped at $len characters on word boundaries and everything
 * from the first line break on (existing or inserted) is dropped.
 *
 * @param string $text Input text.
 * @param int    $len  Wrap width.
 * @return string First line of the wrapped text.
 */
private function limitstrlen($text, $len = 300){
    $wrapped = wordwrap($text, $len, "\n");
    $break = strpos($wrapped, "\n");

    if($break === false){
        return $wrapped;
    }

    return substr($wrapped, 0, $break);
}
/**
 * Convert a "h:m:s", "m:s" or "s" duration into seconds.
 *
 * @param string $time Duration text with up to three ":"-separated fields.
 * @return int Duration in seconds.
 */
private function hms2int($time){
    // last field is seconds, the one before minutes, the one before hours
    $fields = array_reverse(explode(":", $time, 3));
    $weights = [1, 60, 3600];

    $total = 0;
    foreach($fields as $position => $field){
        $total += (int)$field * $weights[$position];
    }

    return $total;
}
/**
 * Convert an abbreviated count such as "1.2K views" into an integer.
 *
 * Only the first space-separated token is read. A trailing k, m or b
 * (any case) multiplies the number by a thousand, a million or a billion.
 *
 * Differences from the previous version: decimals of any length are scaled
 * correctly ("1.25M" is 1250000), the result is always an int (unit-less
 * input used to yield a float), and empty input yields 0 instead of
 * reading a string offset that does not exist.
 *
 * @param string $number Count text.
 * @return int Parsed count.
 */
private function truncatedcount2int($number){
    $token = explode(" ", trim((string)$number), 2)[0];

    // thousands separators; assumes en-US number formatting -- TODO confirm
    $token = str_replace(",", "", $token);

    if($token === ""){
        return 0;
    }

    switch(strtolower(substr($token, -1))){
        case "k":
            $multiplier = 1000;
            break;

        case "m":
            $multiplier = 1000000;
            break;

        case "b":
            $multiplier = 1000000000;
            break;

        default:
            $multiplier = 1;
            break;
    }

    // the float cast reads the leading numeric part and ignores the unit;
    // round() guards against binary floating point noise (1.2 * 1000)
    return (int)round((float)$token * $multiplier);
}
}

262
scraper/fivehpx.php Normal file
View file

@ -0,0 +1,262 @@
<?php
class fivehpx{
/**
 * Helper object of class `backend`, created in the constructor.
 * Declared explicitly so PHP 8.2+ does not report a dynamic-property
 * deprecation; kept public because dynamic properties were public.
 */
public $backend;

/** Helper object of class `fuckhtml`, created in the constructor. */
public $fuckhtml;

/**
 * Load the helper libraries and create the helper objects used by this scraper.
 *
 * include_once (instead of include) keeps a second scraper constructed in the
 * same request from re-including the files and redeclaring their classes.
 */
public function __construct(){
    include_once "lib/backend.php";
    $this->backend = new backend("fivehpx");

    include_once "lib/fuckhtml.php";
    $this->fuckhtml = new fuckhtml();
}
/**
 * List the search filters this scraper offers.
 *
 * @param mixed $page Page type; not consulted.
 * @return array A single "sort" filter.
 */
public function getfilters($page){
    $sort_options = [
        "relevance" => "Relevance",
        "pulse" => "Pulse",
        "newest" => "Newest"
    ];

    return [
        "sort" => [
            "display" => "Sort",
            "option" => $sort_options
        ]
    ];
}
/**
 * Perform an HTTP request through curl and return the response body.
 *
 * Without $post_data a GET request with document-navigation headers is
 * sent; with $post_data the string is POSTed as JSON with the API headers.
 *
 * @param mixed       $proxy     Proxy value handed to
 *                               $this->backend->assign_proxy().
 * @param string      $url       Target URL without query string.
 * @param array       $get       Query parameters; appended when not empty.
 * @param string|null $post_data Request body, or null for a GET request.
 * @return string|bool Whatever curl_exec() returned for the transfer.
 * @throws Exception   With curl's error message when the transfer fails.
 */
private function get($proxy, $url, $get = [], $post_data = null){
    if($get !== []){
        $url .= "?" . http_build_query($get);
    }

    $curlproc = curl_init();
    curl_setopt($curlproc, CURLOPT_URL, $url);
    curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding

    if($post_data === null){
        curl_setopt($curlproc, CURLOPT_HTTPHEADER,
            ["User-Agent: " . config::USER_AGENT,
            "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language: en-US,en;q=0.5",
            "Accept-Encoding: gzip",
            "DNT: 1",
            "Sec-GPC: 1",
            "Connection: keep-alive",
            "Upgrade-Insecure-Requests: 1",
            "Sec-Fetch-Dest: document",
            "Sec-Fetch-Mode: navigate",
            "Sec-Fetch-Site: same-origin",
            "Sec-Fetch-User: ?1",
            "Priority: u=0, i",
            "TE: trailers"]
        );
    }else{
        curl_setopt($curlproc, CURLOPT_HTTPHEADER,
            ["User-Agent: " . config::USER_AGENT,
            "Accept: */*",
            "Accept-Language: en-US,en;q=0.5",
            "Accept-Encoding: gzip",
            "Referer: https://500px.com/",
            "content-type: application/json",
            //"x-csrf-token: undefined",
            "x-500px-source: Search",
            "Content-Length: " . strlen($post_data),
            "Origin: https://500px.com",
            "DNT: 1",
            "Sec-GPC: 1",
            "Connection: keep-alive",
            // "Cookie: _pin_unauth, _fbp, _sharedID, _sharedID_cst",
            "Sec-Fetch-Dest: empty",
            "Sec-Fetch-Mode: cors",
            "Sec-Fetch-Site: same-site",
            "Priority: u=4",
            "TE: trailers"]
        );

        // set post data
        curl_setopt($curlproc, CURLOPT_POST, true);
        curl_setopt($curlproc, CURLOPT_POSTFIELDS, $post_data);
    }

    curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
    curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
    curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

    // http2 bypass
    curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);

    $this->backend->assign_proxy($curlproc, $proxy);

    $data = curl_exec($curlproc);

    if(curl_errno($curlproc)){
        // read the message first, then release the handle before throwing
        // (the handle used to stay open on this path)
        $message = curl_error($curlproc);
        curl_close($curlproc);
        throw new Exception($message);
    }

    curl_close($curlproc);
    return $data;
}
/**
 * Search 500px for photos through its GraphQL API.
 *
 * @param array $get Request parameters. Reads "npt" (next-page token) and,
 *                   for a first page, "s" (search term) and "sort".
 * @return array Result structure with "status", "npt" and "image".
 * @throws Exception When the search term is empty, the request fails, the
 *                   response cannot be decoded, or the API reports an error.
 */
public function image($get){
    if(!$get["npt"]){
        // first page
        $search = $get["s"];
        if(strlen($search) === 0){
            throw new Exception("Search term is empty!");
        }

        $proxy = $this->backend->get_ip();

        $pagination = [
            "sort" => strtoupper($get["sort"]),
            "search" => $search,
            "filters" => [],
            "nlp" => false,
        ];
    }else{
        // following page: the stored value is the JSON-encoded variables
        [$stored, $proxy] = $this->backend->get($get["npt"], "images");

        $pagination = json_decode($stored, true);
        $search = $pagination["search"];
    }

    // the cursor variable is only declared and used when one is present
    $has_cursor = isset($pagination["cursor"]);
    $cursor_declaration = $has_cursor ? '$cursor: String, ' : "";
    $cursor_argument = $has_cursor ? 'after: $cursor, ' : "";

    $payload =
        json_encode([
            "operationName" => "PhotoSearchPaginationContainerQuery",
            "variables" => $pagination,
            "query" =>
                'query PhotoSearchPaginationContainerQuery(' .
                $cursor_declaration .
                '$sort: PhotoSort, $search: String!, $filters: [PhotoSearchFilter!], $nlp: Boolean) { ...PhotoSearchPaginationContainer_query_1vzAZD} fragment PhotoSearchPaginationContainer_query_1vzAZD on Query { photoSearch(sort: $sort, first: 100, ' .
                $cursor_argument .
                'search: $search, filters: $filters, nlp: $nlp) { edges { node { id legacyId canonicalPath name description width height images(sizes: [33, 36]) { size url id } } } totalCount pageInfo { endCursor hasNextPage } }}'
        ]);

    try{
        $response =
            $this->get(
                $proxy,
                "https://api.500px.com/graphql",
                [],
                $payload
            );
    }catch(Exception $error){
        throw new Exception("Failed to fetch graphQL object");
    }

    $json = json_decode($response, true);

    if($json === null){
        throw new Exception("Failed to decode graphQL object");
    }

    if(isset($json["errors"][0]["message"])){
        throw new Exception("500px returned an API error: " . $json["errors"][0]["message"]);
    }

    if(!isset($json["data"]["photoSearch"]["edges"])){
        throw new Exception("No edges returned by API");
    }

    $out = [
        "status" => "ok",
        "npt" => null,
        "image" => []
    ];

    foreach($json["data"]["photoSearch"]["edges"] as $edge){
        $photo = $edge["node"];

        // "<name>: <description>", without dangling separators
        $name = $this->fuckhtml->getTextContent($photo["name"]);
        $description = $this->fuckhtml->getTextContent($photo["description"]);
        $title = trim($name . ": " . $description, " :");

        // dimensions are scaled to a longest edge of 600 and 2048
        $small = $this->image_ratio(600, $photo["width"], $photo["height"]);
        $large = $this->image_ratio(2048, $photo["width"], $photo["height"]);

        $out["image"][] = [
            "title" => $title,
            "source" => [
                [
                    "url" => $photo["images"][1]["url"],
                    "width" => $large[0],
                    "height" => $large[1]
                ],
                [
                    "url" => $photo["images"][0]["url"],
                    "width" => $small[0],
                    "height" => $small[1]
                ]
            ],
            "url" => "https://500px.com" . $photo["canonicalPath"]
        ];
    }

    // get NPT token
    $page_info = $json["data"]["photoSearch"]["pageInfo"];

    if($page_info["hasNextPage"] === true){
        $next_variables = [
            "cursor" => $page_info["endCursor"],
            "search" => $search,
            "sort" => $pagination["sort"],
            "filters" => [],
            "nlp" => false
        ];

        $out["npt"] =
            $this->backend->store(
                json_encode($next_variables),
                "images",
                $proxy
            );
    }

    return $out;
}
/**
 * Scale image dimensions by the smaller of the two edge ratios.
 *
 * @param int|float $longest_edge Target edge length.
 * @param int|float $width        Original width.
 * @param int|float $height       Original height.
 * @return array [width, height] after scaling, each rounded down by floor().
 */
private function image_ratio($longest_edge, $width, $height){
    // the smaller ratio keeps both edges within $longest_edge
    $scale = min(
        $longest_edge / $width,
        $longest_edge / $height
    );

    $scaled_width = floor($width * $scale);
    $scaled_height = floor($height * $scale);

    return [$scaled_width, $scaled_height];
}
}

161
scraper/ftm.php Normal file
View file

@ -0,0 +1,161 @@
<?php
class ftm{
/**
 * Helper object of class `backend`, created in the constructor.
 * Declared explicitly so PHP 8.2+ does not report a dynamic-property
 * deprecation; kept public because dynamic properties were public.
 */
public $backend;

/**
 * Load the helper library and create the helper object used by this scraper.
 *
 * include_once (instead of include) keeps a second scraper constructed in the
 * same request from re-including the file and redeclaring its class.
 */
public function __construct(){
    include_once "lib/backend.php";
    $this->backend = new backend("ftm");
}
/**
 * List the search filters this scraper offers.
 *
 * @param mixed $page Requested page type; not consulted.
 * @return array Always an empty array.
 */
public function getfilters($page){
    $filters = [];

    return $filters;
}
/**
 * POST a search request as JSON through curl and return the response body.
 *
 * @param mixed  $proxy  Proxy value handed to $this->backend->assign_proxy().
 * @param string $url    Target URL.
 * @param string $search Search term; sent in the body and in the Referer.
 * @param int    $offset Result offset sent in the body.
 * @return string|bool   Whatever curl_exec() returned for the transfer.
 * @throws Exception     With curl's error message when the transfer fails.
 */
private function get($proxy, $url, $search, $offset){
    $payload =
        json_encode(
            [
                "search" => $search,
                "offset" => $offset
            ]
        );

    $headers = [
        "User-Agent: " . config::USER_AGENT,
        "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language: en-US,en;q=0.5",
        "Accept-Encoding: gzip",
        "Content-Length: " . strlen($payload),
        "Content-Type: application/json",
        "DNT: 1",
        "Connection: keep-alive",
        "Origin: https://findthatmeme.com",
        "Referer: https://findthatmeme.com/?search=" . urlencode($search),
        "Upgrade-Insecure-Requests: 1",
        "Sec-Fetch-Dest: document",
        "Sec-Fetch-Mode: navigate",
        "Sec-Fetch-Site: none",
        "Sec-Fetch-User: ?1",
        "X-Auth-Key: undefined",
        "X-CSRF-Validation-Header: true"
    ];

    $curlproc = curl_init();
    curl_setopt($curlproc, CURLOPT_URL, $url);
    curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
    curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);

    curl_setopt($curlproc, CURLOPT_POST, true);
    curl_setopt($curlproc, CURLOPT_POSTFIELDS, $payload);

    curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
    curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
    curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

    $this->backend->assign_proxy($curlproc, $proxy);

    $data = curl_exec($curlproc);

    if(curl_errno($curlproc)){
        // read the message first, then release the handle before throwing
        // (the handle used to stay open on this path)
        $message = curl_error($curlproc);
        curl_close($curlproc);
        throw new Exception($message);
    }

    curl_close($curlproc);
    return $data;
}
/**
 * Search findthatmeme.com for images.
 *
 * @param array $get Request parameters. Reads "npt" (next-page token) and,
 *                   for a first page, "s" (search term).
 * @return array Result structure with "status", "npt" and "image". "npt"
 *               stays null when the API returned no items, so pagination
 *               ends instead of offering empty pages forever.
 * @throws Exception When the search term is empty, the request fails or
 *                   the response cannot be decoded.
 */
public function image($get){
    $out = [
        "status" => "ok",
        "npt" => null,
        "image" => []
    ];

    if($get["npt"]){
        // following page: the stored value holds the offset and the term
        [$data, $proxy] = $this->backend->get($get["npt"], "images");
        $data = json_decode($data, true);

        $count = $data["count"];
        $search = $data["search"];
    }else{
        $search = $get["s"];
        if(strlen($search) === 0){
            throw new Exception("Search term is empty!");
        }

        $count = 0;
        $proxy = $this->backend->get_ip();
    }

    try{
        $response =
            $this->get(
                $proxy,
                "https://findthatmeme.com/api/v1/search",
                $search,
                $count
            );
    }catch(Exception $error){
        // same public message as before; the cause is chained for debugging
        throw new Exception("Failed to fetch JSON", 0, $error);
    }

    $json = json_decode($response, true);

    // anything that is not a list of items cannot be iterated below
    if(!is_array($json)){
        throw new Exception("Failed to decode JSON");
    }

    $received = 0;

    foreach($json as $item){
        if(!is_array($item)){
            // not a result entry (e.g. a field of an error object)
            continue;
        }

        // $count is the offset for the next request
        $count++;
        $received++;

        if($item["type"] == "VIDEO"){
            $thumb = "thumb/" . $item["thumbnail"];
        }else{
            $thumb = $item["image_path"];
        }

        $out["image"][] = [
            "title" => date("jS \of F Y @ g:ia", strtotime($item["created_at"])),
            "source" => [
                [
                    "url" =>
                        "https://s3.thehackerblog.com/findthatmeme/" .
                        $thumb,
                    "width" => null,
                    "height" => null
                ]
            ],
            "url" => $item["source_page_url"]
        ];
    }

    // only offer a next page when this one returned something
    if($received !== 0){
        $out["npt"] =
            $this->backend->store(
                json_encode([
                    "count" => $count,
                    "search" => $search
                ]),
                "images",
                $proxy
            );
    }

    return $out;
}
}

320
scraper/ghostery.php Normal file
View file

@ -0,0 +1,320 @@
<?php
class ghostery{
/**
 * Load the helper libraries and create the helper objects used by this scraper.
 *
 * include_once (instead of include) keeps a second scraper constructed in the
 * same request from re-including the files and redeclaring their classes.
 */
public function __construct(){
    include_once "lib/backend.php";
    $this->backend = new backend("ghostery");

    include_once "lib/fuckhtml.php";
    $this->fuckhtml = new fuckhtml();
}
/**
 * List the search filters this scraper offers for a page type.
 *
 * @param mixed $page Page type; only "web" has filters.
 * @return array Empty for every page other than "web"; otherwise a single
 *               "country" filter mapping country codes to display names.
 */
public function getfilters($page){
    $countries = [
        "any" => "All regions",
        "AR" => "Argentina",
        "AU" => "Australia",
        "AT" => "Austria",
        "BE" => "Belgium",
        "BR" => "Brazil",
        "CA" => "Canada",
        "CL" => "Chile",
        "DK" => "Denmark",
        "FI" => "Finland",
        "FR" => "France",
        "DE" => "Germany",
        "HK" => "Hong Kong",
        "IN" => "India",
        "ID" => "Indonesia",
        "IT" => "Italy",
        "JP" => "Japan",
        "KR" => "Korea",
        "MY" => "Malaysia",
        "MX" => "Mexico",
        "NL" => "Netherlands",
        "NZ" => "New Zealand",
        "NO" => "Norway",
        "CN" => "People's Republic of China",
        "PL" => "Poland",
        "PT" => "Portugal",
        "PH" => "Republic of the Philippines",
        "RU" => "Russia",
        "SA" => "Saudi Arabia",
        "ZA" => "South Africa",
        "ES" => "Spain",
        "SE" => "Sweden",
        "CH" => "Switzerland",
        "TW" => "Taiwan",
        "TR" => "Turkey",
        "GB" => "United Kingdom",
        "US" => "United States"
    ];

    if($page == "web"){
        return [
            "country" => [
                "display" => "Country",
                "option" => $countries
            ]
        ];
    }

    return [];
}
private function get($proxy, $url, $get = [], $country){
$curlproc = curl_init();
if($get !== []){
$get = http_build_query($get);
$url .= "?" . $get;
}
curl_setopt($curlproc, CURLOPT_URL, $url);
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
"Referer: https://ghosterysearch.com",
"DNT: 1",
"Sec-GPC: 1",
"Connection: keep-alive",
"Cookie: ctry=" . ($country == "any" ? "--" : $country) . "; noads=true",
"Upgrade-Insecure-Requests: 1",
"Sec-Fetch-Dest: document",
"Sec-Fetch-Mode: navigate",
"Sec-Fetch-Site: same-origin",
"Sec-Fetch-User: ?1",
"Priority: u=0, i"]
);
// http2 bypass
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->backend->assign_proxy($curlproc, $proxy);
$data = curl_exec($curlproc);
if(curl_errno($curlproc)){
throw new Exception(curl_error($curlproc));
}
curl_close($curlproc);
return $data;
}
public function web($get){
if($get["npt"]){
[$query, $proxy] = $this->backend->get($get["npt"], "web");
parse_str($query, $query);
// country
$country = $query["c"];
unset($query["c"]);
$query = http_build_query($query);
try{
$html =
$this->get(
$proxy,
"https://ghosterysearch.com/search?" . $query,
[],
$country
);
}catch(Exception $error){
throw new Exception("Failed to fetch search page");
}
}else{
$proxy = $this->backend->get_ip();
try{
$html =
$this->get(
$proxy,
"https://ghosterysearch.com/search",
[
"q" => $get["s"]
],
$get["country"]
);
}catch(Exception $error){
throw new Exception("Failed to fetch search page");
}
}
$out = [
"status" => "ok",
"spelling" => [
"type" => "no_correction",
"using" => null,
"correction" => null
],
"npt" => null,
"answer" => [],
"web" => [],
"image" => [],
"video" => [],
"news" => [],
"related" => []
];
$this->fuckhtml->load($html);
$results_wrapper =
$this->fuckhtml
->getElementsByClassName(
"results",
"section"
);
if(count($results_wrapper) === 0){
throw new Exception("Failed to grep result section");
}
$this->fuckhtml->load($results_wrapper[0]);
// get search results
$results =
$this->fuckhtml
->getElementsByClassName(
"result",
"li"
);
if(count($results) === 0){
return $out;
}
foreach($results as $result){
$this->fuckhtml->load($result);
$a =
$this->fuckhtml
->getElementsByClassName(
"url",
"a"
);
if(count($a) === 0){
continue;
}
$a = $a[0];
$out["web"][] = [
"title" =>
$this->titledots(
$this->fuckhtml
->getTextContent(
$this->fuckhtml
->getElementsByTagName(
"h2"
)[0]
)
),
"description" =>
$this->titledots(
$this->fuckhtml
->getTextContent(
$this->fuckhtml
->getElementsByTagName(
"p"
)[0]
)
),
"url" =>
$this->fuckhtml
->getTextContent(
$a
["attributes"]
["href"]
),
"date" => null,
"type" => "web",
"thumb" => [
"url" => null,
"ratio" => null
],
"sublink" => [],
"table" => []
];
}
$this->fuckhtml->load($html);
// get pagination token
$pagination_wrapper =
$this->fuckhtml
->getElementsByClassName(
"pagination",
"div"
);
if(count($pagination_wrapper) !== 0){
// found next page!
$this->fuckhtml->load($pagination_wrapper[0]);
$a =
$this->fuckhtml
->getElementsByTagName(
"a"
);
if(count($a) !== 0){
$q =
parse_url(
$this->fuckhtml
->getTextContent(
$a[count($a) - 1]
["attributes"]
["href"]
),
PHP_URL_QUERY
);
$out["npt"] =
$this->backend
->store(
$q . "&c=" . $get["country"],
"web",
$proxy
);
}
}
return $out;
}
private function titledots($title){
return trim($title, " .\t\n\r\0\x0B");
}
}

3448
scraper/google.php Normal file

File diff suppressed because it is too large Load diff

1054
scraper/google_cse.php Normal file

File diff suppressed because it is too large Load diff

435
scraper/greppr.php Normal file
View file

@ -0,0 +1,435 @@
<?php
/**
 * Scraper for greppr.org web results.
 *
 * greppr requires per-session search tokens; they are scraped from the
 * returned HTML, paired with the PHPSESSID cookie and cached in APCu under
 * the "greppr_token" key.
 */
class greppr{
	
	// Declared explicitly: creating undeclared (dynamic) properties is
	// deprecated since PHP 8.2. Kept public so outside access keeps working.
	public $backend;
	public $fuckhtml;
	
	/**
	 * Loads the backend and HTML parser libraries and instantiates both.
	 */
	public function __construct(){
		
		include "lib/backend.php";
		$this->backend = new backend("greppr");
		
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}
	
	/**
	 * This scraper exposes no filters.
	 *
	 * @param string $page Page type (unused).
	 * @return array Always [].
	 */
	public function getfilters($page){
		
		return [];
	}
	
	/**
	 * Performs a GET request and returns the body and response headers.
	 *
	 * @param mixed        $proxy  Proxy descriptor handed to backend::assign_proxy().
	 * @param string       $url    Target URL.
	 * @param array        $get    Query parameters appended to $url when non-empty.
	 * @param string|false $cookie PHPSESSID value to send, or false for no cookie.
	 * @return array ["headers" => lowercase-name => value, "data" => body].
	 *               A header that appears several times keeps its last value.
	 * @throws Exception With the curl error message when the transfer fails.
	 */
	private function get($proxy, $url, $get = [], $cookie = false){
		
		$curlproc = curl_init();
		
		if($get !== []){
			
			$get = http_build_query($get);
			$url .= "?" . $get;
		}
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		
		// single header list; the cookie (when given) keeps its original
		// position right after Accept-Encoding
		$request_headers = [
			"User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip"
		];
		
		if($cookie !== false){
			
			$request_headers[] = "Cookie: PHPSESSID=" . $cookie;
		}
		
		array_push(
			$request_headers,
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"
		);
		
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $request_headers);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		// collect response headers
		$headers = [];
		
		curl_setopt(
			$curlproc,
			CURLOPT_HEADERFUNCTION,
			function($curlproc, $header) use (&$headers){
				
				$len = strlen($header);
				$header = explode(':', $header, 2);
				
				if(count($header) < 2){
					
					// ignore invalid headers
					return $len;
				}
				
				$headers[strtolower(trim($header[0]))] = trim($header[1]);
				
				return $len;
			}
		);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// release the handle before bailing out
			$error = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($error);
		}
		
		curl_close($curlproc);
		
		return [
			"headers" => $headers,
			"data" => $data
		];
	}
	
	/**
	 * Fetches and parses one page of web results.
	 *
	 * @param array $get           Request parameters: "s" (search term) and
	 *                             "npt" (next-page token, falsy on first page).
	 * @param bool  $first_attempt Internal: false on the single retry that
	 *                             forces a fresh token fetch.
	 * @return array Response object with "web" results and optional "npt".
	 * @throws Exception On empty search, fetch failure or token failure.
	 */
	public function web($get, $first_attempt = true){
		
		if($get["npt"]){
			
			[$q, $proxy] = $this->backend->get($get["npt"], "web");
			
			$q = json_decode($q, true);
			
		}else{
			
			$search = $get["s"];
			
			if(strlen($search) === 0){
				
				throw new Exception("Search term is empty!");
			}
			
			$proxy = $this->backend->get_ip();
		}
		
		// get token
		// token[0] = static token that changes once a day
		// token[1] = dynamic token that changes on every request
		// token[2] = PHPSESSID cookie
		$tokens = apcu_fetch("greppr_token");
		
		if(
			$tokens === false ||
			$first_attempt === false // force token fetch
		){
			
			// we haven't gotten the token yet, get it
			try{
				
				$response =
					$this->get(
						$proxy,
						"https://greppr.org",
						[]
					);
				
			}catch(Exception $error){
				
				throw new Exception("Failed to fetch search tokens");
			}
			
			$tokens = $this->parse_token($response);
			
			if($tokens === false){
				
				throw new Exception("Failed to grep search tokens");
			}
		}
		
		if($get["npt"]){
			
			$params = [
				$tokens[0] => $q["q"],
				"s" => $q["s"],
				"l" => 30,
				"n" => $tokens[1]
			];
		}else{
			
			$params = [
				$tokens[0] => $search,
				"n" => $tokens[1]
			];
		}
		
		try{
			
			$searchresults =
				$this->get(
					$proxy,
					"https://greppr.org/search",
					$params,
					$tokens[2]
				);
			
		}catch(Exception $error){
			
			throw new Exception("Failed to fetch search page");
		}
		
		if(strlen($searchresults["data"]) === 0){
			
			// redirected to main page, which means we got old token
			// generate a new one
			
			// ... unless we just tried to do that
			if($first_attempt === false){
				
				throw new Exception("Failed to get a new search token");
			}
			
			return $this->web($get, false);
		}
		
		// refresh the token with new data (this also triggers fuckhtml load)
		$this->parse_token($searchresults, $tokens[2]);
		
		// response object
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		// get results for later (the parser is re-loaded below)
		$results =
			$this->fuckhtml
			->getElementsByClassName("result", "div");
		
		// check for next page
		$next_elem =
			$this->fuckhtml
			->getElementsByClassName("pagination", "ul");
		
		if(count($next_elem) !== 0){
			
			$this->fuckhtml->load($next_elem[0]);
			
			$as =
				$this->fuckhtml
				->getElementsByClassName("page-link", "a");
			
			// the current page links to "#"; the link right after it is
			// the next page
			$break = false;
			
			foreach($as as $a){
				
				$href = $a["attributes"]["href"] ?? "";
				
				if($break === true){
					
					parse_str(
						$this->fuckhtml
						->getTextContent($href),
						$values
					);
					
					$values = array_values($values);
					
					// only emit a token when both expected values are there
					if(isset($values[0], $values[1])){
						
						$out["npt"] =
							$this->backend->store(
								json_encode(
									[
										"q" => $values[0],
										"s" => $values[1]
									]
								),
								"web",
								$proxy
							);
					}
					break;
				}
				
				if($href == "#"){
					
					$break = true;
				}
			}
		}
		
		// scrape results
		foreach($results as $result){
			
			$this->fuckhtml->load($result);
			
			$links =
				$this->fuckhtml
				->getElementsByTagName("a");
			
			if(
				count($links) === 0 ||
				!isset($links[0]["attributes"]["href"])
			){
				
				// no link, nothing usable to emit
				continue;
			}
			
			$a = $links[0];
			
			$description =
				$this->fuckhtml
				->getElementsByClassName("highlightedDesc", "p");
			
			if(count($description) === 0){
				
				$description = null;
			}else{
				
				$description =
					$this->limitstrlen(
						$this->fuckhtml
						->getTextContent($description[0])
					);
			}
			
			// the last <p> is read as "<label>: <date>"
			$date = null;
			
			$paragraphs =
				$this->fuckhtml
				->getElementsByTagName("p");
			
			if(count($paragraphs) !== 0){
				
				// limit 2: keep everything after the first colon so a
				// timestamp containing colons is not cut short
				$parts =
					explode(
						":",
						$this->fuckhtml
						->getTextContent(
							$paragraphs[count($paragraphs) - 1]["innerHTML"]
						),
						2
					);
				
				if(isset($parts[1])){
					
					$time = strtotime(trim($parts[1]));
					
					if($time !== false){
						
						$date = $time;
					}
				}
			}
			
			$out["web"][] = [
				"title" =>
					$this->fuckhtml
					->getTextContent($a["innerHTML"]),
				"description" => $description,
				"url" =>
					$this->fuckhtml
					->getTextContent($a["attributes"]["href"]),
				"date" => $date,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}
		
		return $out;
	}
	
	/**
	 * Extracts the search tokens from a response and caches them in APCu.
	 *
	 * Side effect: loads $response["data"] into the shared fuckhtml parser.
	 *
	 * @param array        $response Result of get(): "headers" and "data".
	 * @param string|false $cookie   Known PHPSESSID to reuse; when false the
	 *                               cookie is read from the set-cookie header.
	 * @return array|false [token name, "n" value, PHPSESSID], or false when
	 *                     the tokens or the cookie cannot be found.
	 */
	private function parse_token($response, $cookie = false){
		
		$this->fuckhtml->load($response["data"]);
		
		$scripts =
			$this->fuckhtml
			->getElementsByTagName("script");
		
		$tokens = null;
		
		foreach($scripts as $script){
			
			if(
				preg_match(
					'/window\.location ?= ?\'\/search\?([^=]+).*&n=([0-9]+)/',
					$script["innerHTML"],
					$match
				) === 1
			){
				
				$tokens = [
					$match[1],
					$match[2]
				];
				break;
			}
		}
		
		if($tokens === null){
			
			return false;
		}
		
		if($cookie !== false){
			
			// we already specified a cookie, so use the one we have already
			$tokens[] = $cookie;
			apcu_store("greppr_token", $tokens);
			
			return $tokens;
		}
		
		if(!isset($response["headers"]["set-cookie"])){
			
			// server didn't send a cookie
			return false;
		}
		
		// get cookie
		preg_match(
			'/PHPSESSID=([^;]+)/',
			$response["headers"]["set-cookie"],
			$session
		);
		
		if(!isset($session[1])){
			
			// server sent an unexpected cookie
			return false;
		}
		
		$tokens[] = $session[1];
		apcu_store("greppr_token", $tokens);
		
		return $tokens;
	}
	
	/**
	 * Returns the first line of $text after word-wrapping it at 300 columns.
	 *
	 * @param string $text
	 * @return string
	 */
	private function limitstrlen($text){
		
		return explode("\n", wordwrap($text, 300, "\n"))[0];
	}
}

258
scraper/imgur.php Normal file
View file

@ -0,0 +1,258 @@
<?php
/**
 * Scraper for imgur.com image search.
 *
 * Result pages are fetched as HTML fragments and parsed with `fuckhtml`;
 * pagination state is kept in tokens stored through the `backend` helper.
 */
class imgur{
	
	// Declared explicitly: creating undeclared (dynamic) properties is
	// deprecated since PHP 8.2. Kept public so outside access keeps working.
	public $fuckhtml;
	public $backend;
	
	/**
	 * Loads the HTML parser and backend libraries and instantiates both.
	 */
	public function __construct(){
		
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
		
		include "lib/backend.php";
		$this->backend = new backend("imgur");
	}
	
	/**
	 * Returns the filters supported by the image search.
	 *
	 * @param string $page Page type (unused; the same set is always returned).
	 * @return array Filter definitions keyed by filter name.
	 */
	public function getfilters($page){
		
		return [
			"sort" => [ // /score/
				"display" => "Sort by",
				"option" => [
					"score" => "Highest scoring",
					"relevance" => "Most relevant",
					"time" => "Newest first"
				]
			],
			"time" => [ // /score/day/
				"display" => "Time posted",
				"option" => [
					"all" => "All time",
					"day" => "Today",
					"week" => "This week",
					"month" => "This month",
					"year" => "This year"
				]
			],
			"format" => [ // q_type
				"display" => "Format",
				"option" => [
					"any" => "Any format",
					"jpg" => "JPG",
					"png" => "PNG",
					"gif" => "GIF",
					"anigif" => "Animated GIF",
					"album" => "Albums"
				]
			],
			"size" => [ // q_size_px
				"display" => "Size",
				"option" => [
					"any" => "Any size",
					"small" => "Small (500px or less)",
					"med" => "Medium (500px to 2000px)",
					"big" => "Big (2000px to 5000px)",
					"lrg" => "Large (5000px to 10000px)",
					"huge" => "Huge (10000px and above)"
				]
			]
		];
	}
	
	/**
	 * Performs a GET request and returns the response body.
	 *
	 * @param mixed  $proxy Proxy descriptor handed to backend::assign_proxy().
	 * @param string $url   Target URL.
	 * @param array  $get   Query parameters; when non-empty they are appended
	 *                      after a leading "scrolled" flag.
	 * @return string|bool Whatever curl_exec() returned.
	 * @throws Exception With the curl error message when the transfer fails.
	 */
	private function get($proxy, $url, $get = []){
		
		$curlproc = curl_init();
		
		if($get !== []){
			
			$get = http_build_query($get);
			$url .= "?scrolled&" . $get;
		}
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Referer: https://imgur.com/search/",
			"Connection: keep-alive",
			"Sec-Fetch-Dest: empty",
			"Sec-Fetch-Mode: cors",
			"Sec-Fetch-Site: same-origin",
			"TE: trailers",
			"X-Requested-With: XMLHttpRequest"]
		);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// release the handle before bailing out
			$error = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($error);
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	/**
	 * Fetches and parses one page of image results.
	 *
	 * @param array $get Request parameters: "s", "sort", "time", "format",
	 *                   "size" and "npt" (next-page token, falsy on page 0).
	 * @return array ["status", "npt", "image"]; "npt" is set whenever at
	 *               least one image was found on this page.
	 * @throws Exception On empty search or fetch failure.
	 */
	public function image($get){
		
		if($get["npt"]){
			
			// next page: everything needed was saved in the token
			[$filter, $proxy] =
				$this->backend->get(
					$get["npt"],
					"images"
				);
			
			$filter = json_decode($filter, true);
			
			// pull out our bookkeeping keys; what is left in $filter is
			// the query string for imgur
			$search = $filter["s"];
			unset($filter["s"]);
			
			$sort = $filter["sort"];
			unset($filter["sort"]);
			
			$time = $filter["time"];
			unset($filter["time"]);
			
			$format = $filter["format"];
			unset($filter["format"]);
			
			$size = $filter["size"];
			unset($filter["size"]);
			
			$page = $filter["page"];
			unset($filter["page"]);
		}else{
			
			$search = $get["s"];
			
			if(strlen($search) === 0){
				
				throw new Exception("Search term is empty!");
			}
			
			$proxy = $this->backend->get_ip();
			$sort = $get["sort"];
			$time = $get["time"];
			$format = $get["format"];
			$size = $get["size"];
			$page = 0;
			
			$filter = [
				"q" => $search
			];
			
			if($format != "any"){
				
				$filter["q_type"] = $format;
			}
			
			if($size != "any"){
				
				$filter["q_size_px"] = $size;
				$filter["q_size_is_mpx"] = "off";
			}
		}
		
		$out = [
			"status" => "ok",
			"npt" => null,
			"image" => []
		];
		
		try{
			
			$html =
				$this->get(
					$proxy,
					"https://imgur.com/search/$sort/$time/page/$page",
					$filter
				);
			
		}catch(Exception $error){
			
			throw new Exception("Failed to fetch HTML");
		}
		
		$this->fuckhtml->load($html);
		
		$posts =
			$this->fuckhtml
			->getElementsByClassName("post", "div");
		
		foreach($posts as $post){
			
			$this->fuckhtml->load($post);
			
			$images =
				$this->fuckhtml
				->getElementsByTagName("img");
			
			$links =
				$this->fuckhtml
				->getElementsByClassName("image-list-link", "a");
			
			if(
				count($images) === 0 ||
				count($links) === 0 ||
				!isset($images[0]["attributes"]["src"]) ||
				!isset($links[0]["attributes"]["href"])
			){
				
				// post without a thumbnail or a link, skip it instead of
				// indexing into missing elements
				continue;
			}
			
			$image = $images[0];
			
			// drop the last 5 characters of the protocol-relative src
			// (presumably a thumbnail size letter + ".jpg" -- TODO confirm)
			$image_url =
				"https:" .
				substr(
					$this->fuckhtml
					->getTextContent($image["attributes"]["src"]),
					0,
					-5
				);
			
			$out["image"][] = [
				"title" =>
					$this->fuckhtml
					->getTextContent(
						$image["attributes"]["alt"] ?? ""
					),
				"source" => [
					[
						"url" => $image_url . ".jpg",
						"width" => null,
						"height" => null
					],
					[
						"url" => $image_url . "m.jpg",
						"width" => null,
						"height" => null
					]
				],
				"url" =>
					"https://imgur.com" .
					$this->fuckhtml
					->getTextContent(
						$links[0]["attributes"]["href"]
					)
			];
		}
		
		if(isset($out["image"][0])){
			
			// store nextpage
			$filter["s"] = $search;
			$filter["sort"] = $sort;
			$filter["time"] = $time;
			$filter["format"] = $format;
			$filter["size"] = $size;
			$filter["page"] = $page + 1;
			
			$out["npt"] =
				$this->backend->store(
					json_encode($filter),
					"images",
					$proxy
				);
		}
		
		return $out;
	}
}

476
scraper/marginalia.php Normal file
View file

@ -0,0 +1,476 @@
<?php
/**
 * Scraper for Marginalia search.
 *
 * Uses the JSON API when config::MARGINALIA_API_KEY is set, otherwise parses
 * the HTML of old-search.marginalia.nu (which also supports pagination).
 */
class marginalia{
	
	// Declared explicitly: creating undeclared (dynamic) properties is
	// deprecated since PHP 8.2. Kept public so outside access keeps working.
	public $fuckhtml;
	public $backend;
	
	/**
	 * Loads the HTML parser and backend libraries and instantiates both.
	 */
	public function __construct(){
		
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
		
		include "lib/backend.php";
		$this->backend = new backend("marginalia");
	}
	
	/**
	 * Returns the supported filters.
	 *
	 * The "adtech", "recent" and "intitle" filters are only offered when no
	 * API key is configured (they are applied by the HTML scraper only).
	 *
	 * @param string $page Page type (unused).
	 * @return array Filter definitions keyed by filter name.
	 */
	public function getfilters($page){
		
		if(config::MARGINALIA_API_KEY === null){
			
			$base = [
				"adtech" => [
					"display" => "Reduce adtech",
					"option" => [
						"no" => "No",
						"yes" => "Yes"
					]
				],
				"recent" => [
					"display" => "Recent results",
					"option" => [
						"no" => "No",
						"yes" => "Yes"
					]
				],
				"intitle" => [
					"display" => "Search in title",
					"option" => [
						"no" => "No",
						"yes" => "Yes"
					]
				]
			];
		}else{
			
			$base = [];
		}
		
		return array_merge(
			$base,
			[
				"format" => [
					"display" => "Format",
					"option" => [
						"any" => "Any format",
						"html5" => "html5",
						"xhtml" => "xhtml",
						"html123" => "html123"
					]
				],
				"file" => [
					"display" => "Filetype",
					"option" => [
						"any" => "Any filetype",
						"nomedia" => "Deny media",
						"media" => "Contains media",
						"audio" => "Contains audio",
						"video" => "Contains video",
						"archive" => "Contains archive",
						"document" => "Contains document"
					]
				],
				"javascript" => [
					"display" => "Javascript",
					"option" => [
						"any" => "Allow JS",
						"deny" => "Deny JS",
						"require" => "Require JS"
					]
				],
				"trackers" => [
					"display" => "Trackers",
					"option" => [
						"any" => "Allow trackers",
						"deny" => "Deny trackers",
						"require" => "Require trackers"
					]
				],
				"cookies" => [
					"display" => "Cookies",
					"option" => [
						"any" => "Allow cookies",
						"deny" => "Deny cookies",
						"require" => "Require cookies"
					]
				],
				"affiliate" => [
					"display" => "Affiliate links in body",
					"option" => [
						"any" => "Allow affiliate links",
						"deny" => "Deny affiliate links",
						"require" => "Require affiliate links"
					]
				]
			]
		);
	}
	
	/**
	 * Performs a GET request and returns the response body.
	 *
	 * @param mixed  $proxy Proxy descriptor handed to backend::assign_proxy().
	 * @param string $url   Target URL.
	 * @param array  $get   Query parameters appended to $url when non-empty.
	 * @return string|bool Whatever curl_exec() returned.
	 * @throws Exception With the curl error message when the transfer fails.
	 */
	private function get($proxy, $url, $get = []){
		
		$headers = [
			"User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"
		];
		
		$curlproc = curl_init();
		
		if($get !== []){
			
			$get = http_build_query($get);
			$url .= "?" . $get;
		}
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// release the handle before bailing out
			$error = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($error);
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	/**
	 * Builds the search string: the user's term followed by the operators
	 * derived from the javascript/trackers/cookies/affiliate, format and
	 * file filters (in that order).
	 *
	 * @param array $get Request parameters.
	 * @return string Space-separated query.
	 */
	private function build_search($get){
		
		$search = [$get["s"]];
		
		// filter name => search operator
		$operators = [
			"javascript" => "js:true",
			"trackers" => "special:tracking",
			"cookies" => "special:cookies",
			"affiliate" => "special:affiliate"
		];
		
		foreach($operators as $key => $str){
			
			$value = $get[$key];
			
			if($value == "any"){ continue; }
			
			// "deny" negates the operator, anything else requires it
			if($value == "deny"){
				
				$str = "-" . $str;
			}
			
			$search[] = $str;
		}
		
		$format = $get["format"];
		
		if($format != "any"){
			
			$search[] = "format:$format";
		}
		
		$file = $get["file"];
		
		switch($file){
			
			case "any": break;
			case "nomedia": $search[] = "-special:media"; break;
			case "media": $search[] = "special:media"; break;
			
			default:
				$search[] = "file:$file";
		}
		
		return implode(" ", $search);
	}
	
	/**
	 * Builds the query parameters for the first HTML search request.
	 *
	 * @param string $search Query produced by build_search().
	 * @param array  $get    Request parameters ("adtech", "recent", "intitle").
	 * @return array Parameters for old-search.marginalia.nu/search.
	 */
	private function html_params($search, $get){
		
		$params = [
			"query" => $search
		];
		
		// filter name => [request parameter, value sent when enabled]
		$toggles = [
			"adtech" => ["adtech", "reduce"],
			"recent" => ["recent", "recent"],
			"intitle" => ["searchTitle", "title"]
		];
		
		foreach($toggles as $filter => [$name, $value]){
			
			if(($get[$filter] ?? null) == "yes"){
				
				$params[$name] = $value;
			}
		}
		
		return $params;
	}
	
	/**
	 * Fetches and parses one page of web results.
	 *
	 * @param array $get Request parameters: "s", the filter values and "npt"
	 *                   (next-page token; only honoured by the HTML scraper).
	 * @return array Response object with "web" results and optional "npt".
	 * @throws Exception On empty search, fetch failure, rate limiting or
	 *                   undecodable API response.
	 */
	public function web($get){
		
		if(strlen($get["s"]) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$search = $this->build_search($get);
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		// API scraper
		if(config::MARGINALIA_API_KEY !== null){
			
			try{
				$json =
					$this->get(
						$this->backend->get_ip(), // no nextpage
						"https://api.marginalia-search.com/" . config::MARGINALIA_API_KEY . "/search/" . urlencode($search),
						[
							"count" => 20
						]
					);
				
			}catch(Exception $error){
				
				throw new Exception("Failed to get JSON");
			}
			
			if($json == "Slow down"){
				
				throw new Exception("The API key used is rate limited. Please try again in a few minutes.");
			}
			
			$json = json_decode($json, true);
			
			if(!is_array($json)){
				
				throw new Exception("Failed to decode JSON");
			}
			
			foreach(($json["results"] ?? []) as $result){
				
				$out["web"][] = [
					"title" => $result["title"],
					"description" => str_replace("\n", " ", $result["description"]),
					"url" => $result["url"],
					"date" => null,
					"type" => "web",
					"thumb" => [
						"url" => null,
						"ratio" => null
					],
					"sublink" => [],
					"table" => []
				];
			}
			
			return $out;
		}
		
		// HTML parser
		if($get["npt"]){
			
			// the token holds the query string of the next page
			[$params, $proxy] =
				$this->backend->get(
					$get["npt"],
					"web"
				);
			
			try{
				$html =
					$this->get(
						$proxy,
						"https://old-search.marginalia.nu/search?" . $params
					);
				
			}catch(Exception $error){
				
				throw new Exception("Failed to get HTML");
			}
			
		}else{
			
			$proxy = $this->backend->get_ip();
			
			try{
				$html =
					$this->get(
						$proxy,
						"https://old-search.marginalia.nu/search",
						$this->html_params($search, $get)
					);
				
			}catch(Exception $error){
				
				throw new Exception("Failed to get HTML");
			}
		}
		
		$this->fuckhtml->load($html);
		
		$sections =
			$this->fuckhtml
			->getElementsByClassName(
				"card search-result",
				"section"
			);
		
		foreach($sections as $section){
			
			$this->fuckhtml->load($section);
			
			$titles =
				$this->fuckhtml
				->getElementsByClassName("title", "a");
			
			if(
				count($titles) === 0 ||
				!isset($titles[0]["attributes"]["href"])
			){
				
				// card without a title link, nothing usable to emit
				continue;
			}
			
			$title = $titles[0];
			
			$description =
				$this->fuckhtml
				->getElementsByClassName("description", "p");
			
			if(count($description) !== 0){
				
				$description =
					$this->fuckhtml
					->getTextContent($description[0]);
			}else{
				
				$description = null;
			}
			
			$sublinks = [];
			$sublink_html =
				$this->fuckhtml
				->getElementsByClassName("additional-results");
			
			if(count($sublink_html) !== 0){
				
				$this->fuckhtml->load($sublink_html[0]);
				
				$links =
					$this->fuckhtml
					->getElementsByTagName("a");
				
				foreach($links as $link){
					
					$sublinks[] = [
						"title" =>
							$this->fuckhtml
							->getTextContent($link),
						"date" => null,
						"description" => null,
						"url" =>
							$this->fuckhtml
							->getTextContent(
								$link["attributes"]["href"]
							)
					];
				}
			}
			
			$out["web"][] = [
				"title" =>
					$this->fuckhtml
					->getTextContent($title),
				"description" => $description,
				"url" =>
					$this->fuckhtml
					->getTextContent(
						$title["attributes"]["href"]
					),
				"date" => null,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => $sublinks,
				"table" => []
			];
		}
		
		// get next page
		$this->fuckhtml->load($html);
		
		$pagination =
			$this->fuckhtml
			->getElementsByAttributeValue(
				"aria-label",
				"pagination",
				"nav"
			);
		
		if(count($pagination) === 0){
			
			// no pagination
			return $out;
		}
		
		$this->fuckhtml->load($pagination[0]);
		
		$pages =
			$this->fuckhtml
			->getElementsByClassName("page-link", "a");
		
		$found_current_page = false;
		
		foreach($pages as $page){
			
			if(
				stripos(
					$page["attributes"]["class"],
					"active"
				) !== false
			){
				
				$found_current_page = true;
				continue;
			}
			
			if($found_current_page){
				
				// we found current page index, and we iterated over
				// the next page <a>
				
				// decode the attribute like every other href in this class
				$query =
					parse_url(
						$this->fuckhtml
						->getTextContent(
							$page["attributes"]["href"] ?? ""
						),
						PHP_URL_QUERY
					);
				
				if(is_string($query) && $query !== ""){
					
					$out["npt"] =
						$this->backend->store(
							$query,
							"web",
							$proxy
						);
				}
				break;
			}
		}
		
		return $out;
	}
}

1174
scraper/mojeek.php Normal file

File diff suppressed because it is too large Load diff

236
scraper/mwmbl.php Normal file
View file

@ -0,0 +1,236 @@
<?php
/**
 * Scraper for beta.mwmbl.org web results (single page, no pagination).
 */
class mwmbl{
	
	// Declared explicitly: creating undeclared (dynamic) properties is
	// deprecated since PHP 8.2. Kept public so outside access keeps working.
	public $backend;
	public $fuckhtml;
	
	/**
	 * Loads the backend and HTML parser libraries and instantiates both.
	 */
	public function __construct(){
		
		include "lib/backend.php";
		$this->backend = new backend("mwmbl");
		
		include "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}
	
	/**
	 * This scraper exposes no filters.
	 *
	 * @param string $page Page type (unused).
	 * @return array Always [].
	 */
	public function getfilters($page){
		
		return [];
	}
	
	/**
	 * Performs a GET request and returns the response body.
	 *
	 * @param mixed  $proxy Proxy descriptor handed to backend::assign_proxy().
	 * @param string $url   Target URL.
	 * @param array  $get   Query parameters appended to $url when non-empty.
	 * @return string|bool Whatever curl_exec() returned.
	 * @throws Exception With the curl error message when the transfer fails.
	 */
	private function get($proxy, $url, $get = []){
		
		$curlproc = curl_init();
		
		if($get !== []){
			
			$get = http_build_query($get);
			$url .= "?" . $get;
		}
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		// use http2
		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"Referer: https://beta.mwmbl.org/",
			"DNT: 1",
			"Sec-GPC: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: same-origin",
			"Priority: u=0, i",
			"Sec-Fetch-User: ?1"]
		);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// release the handle before bailing out
			$error = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($error);
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	/**
	 * Fetches and parses the web results for a search term.
	 *
	 * @param array $get Request parameters; only "s" (search term) is read.
	 * @return array Response object with "web" results; "npt" is always null.
	 * @throws Exception On empty search or fetch failure.
	 */
	public function web($get){
		
		$search = $get["s"];
		
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		try{
			$html = $this->get(
				$this->backend->get_ip(), // no next page!
				"https://beta.mwmbl.org/",
				[
					"q" => $search
				]
			);
		}catch(Exception $error){
			
			throw new Exception("Failed to fetch HTML. If you're getting a timeout, make sure you have curl-impersonate setup.");
		}
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		$this->fuckhtml->load($html);
		
		$results =
			$this->fuckhtml
			->getElementsByClassName("result", "li");
		
		foreach($results as $result){
			
			$this->fuckhtml->load($result);
			
			// paragraphs of the result; searched by class further down
			$p =
				$this->fuckhtml
				->getElementsByTagName("p");
			
			$sublinks = [];
			
			$mores =
				$this->fuckhtml
				->getElementsByClassName("result-link-more", "div");
			
			foreach($mores as $more){
				
				$this->fuckhtml->load($more);
				
				$as =
					$this->fuckhtml
					->getElementsByClassName("more", "a");
				
				$more_titles =
					$this->fuckhtml
					->getElementsByClassName("more-title", "span");
				
				if(
					count($as) === 0 ||
					count($more_titles) === 0 ||
					!isset($as[0]["attributes"]["href"])
				){
					
					// ?? invalid
					continue;
				}
				
				$more_extracts =
					$this->fuckhtml
					->getElementsByClassName("more-extract", "span");
				
				$sublinks[] = [
					"title" =>
						$this->titledots(
							$this->fuckhtml
							->getTextContent($more_titles[0])
						),
					"description" =>
						count($more_extracts) === 0 ?
						null :
						$this->titledots(
							$this->fuckhtml
							->getTextContent($more_extracts[0])
						),
					"url" =>
						$this->fuckhtml
						->getTextContent(
							$as[0]["attributes"]["href"]
						)
				];
			}
			
			// reset
			$this->fuckhtml->load($result);
			
			$titles =
				$this->fuckhtml
				->getElementsByClassName("title", $p);
			
			$links =
				$this->fuckhtml
				->getElementsByTagName("a");
			
			if(
				count($titles) === 0 ||
				count($links) === 0 ||
				!isset($links[0]["attributes"]["href"])
			){
				
				// no title or link, nothing usable to emit
				continue;
			}
			
			$extracts =
				$this->fuckhtml
				->getElementsByClassName("extract", $p);
			
			$out["web"][] = [
				"title" =>
					$this->titledots(
						$this->fuckhtml
						->getTextContent($titles[0])
					),
				"description" =>
					count($extracts) === 0 ?
					null :
					$this->titledots(
						$this->fuckhtml
						->getTextContent($extracts[0])
					),
				"url" =>
					$this->fuckhtml
					->getTextContent(
						$links[0]["attributes"]["href"]
					),
				"date" => null,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => $sublinks,
				"table" => []
			];
		}
		
		return $out;
	}
	
	/**
	 * Removes trailing horizontal-ellipsis characters (U+2026) from a string.
	 *
	 * The previous rtrim($title, "") had an empty character list and removed
	 * nothing. A UTF-8 aware regex is used instead of rtrim() because rtrim()
	 * works on single bytes and could cut other multibyte characters in half.
	 *
	 * @param string $title
	 * @return string $title without trailing ellipses; returned unchanged if
	 *                it is not valid UTF-8.
	 */
	private function titledots($title){
		
		return preg_replace('/\x{2026}+\z/u', "", $title) ?? $title;
	}
}

439
scraper/pinterest.php Normal file
View file

@ -0,0 +1,439 @@
<?php
class pinterest{
// Loads lib/backend.php and creates the backend helper for the "pinterest"
// scraper. Unlike the HTML scrapers, no HTML parser is loaded here.
public function __construct(){
include "lib/backend.php";
$this->backend = new backend("pinterest");
}
// Returns the filters available for $page. This scraper exposes none, so
// the result is always an empty array regardless of $page.
public function getfilters($page){
return [];
}
/**
 * Performs a request against Pinterest and returns the response body.
 *
 * With $header_data_post === null a GET is sent and the cookies of the
 * response are collected into $cookies. Otherwise a POST (pagination) is
 * sent, using the CSRF token and cookie string already held in $cookies.
 *
 * @param mixed       $proxy            Proxy descriptor handed to backend::assign_proxy().
 * @param string      $url              Target URL.
 * @param array       $get              Query parameters (GET) or form fields (POST).
 * @param array|null  $cookies          By reference. POST: read ("csrf", "cookie").
 *                                      GET: overwritten with ["csrf" => ..., "cookie" => ...].
 *                                      Defaulted to null only so the optional $get no longer
 *                                      precedes a required parameter (deprecated since PHP 8.0).
 * @param string|null $header_data_post Search term used in the X-Pinterest-Source-Url
 *                                      header; non-null selects the POST mode.
 * @return string|bool Whatever curl_exec() returned.
 * @throws Exception With the curl error message when the transfer fails, or
 *                   "Failed to grep CSRF token" when a GET response carries
 *                   no csrftoken cookie.
 */
private function get($proxy, $url, $get = [], &$cookies = null, $header_data_post = null){
	
	$curlproc = curl_init();
	
	if($header_data_post === null){
		
		// handling GET
		
		// extract cookies
		$cookies_tmp = [];
		
		curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){
			
			$length = strlen($header);
			
			$header = explode(":", $header, 2);
			
			if(
				count($header) === 2 &&
				trim(strtolower($header[0])) == "set-cookie"
			){
				
				$cookie_tmp = explode("=", trim($header[1]), 2);
				
				// ignore a malformed cookie that has no "name=value" pair
				if(count($cookie_tmp) === 2){
					
					$cookies_tmp[trim($cookie_tmp[0])] =
						explode(";", $cookie_tmp[1], 2)[0];
				}
			}
			
			return $length;
		});
		
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: application/json, text/javascript, */*, q=0.01",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"Referer: https://ca.pinterest.com/",
			"X-Requested-With: XMLHttpRequest",
			"X-APP-VERSION: 78f8764",
			"X-Pinterest-AppState: active",
			"X-Pinterest-Source-Url: /",
			"X-Pinterest-PWS-Handler: www/index.js",
			"screen-dpr: 1",
			"is-preload-enabled: 1",
			"DNT: 1",
			"Sec-GPC: 1",
			"Sec-Fetch-Dest: empty",
			"Sec-Fetch-Mode: cors",
			"Sec-Fetch-Site: same-origin",
			"Connection: keep-alive",
			"Alt-Used: ca.pinterest.com",
			"Priority: u=0",
			"TE: trailers"]
		);
		
		if($get !== []){
			
			$get = http_build_query($get);
			$url .= "?" . $get;
		}
	}else{
		
		// handling POST (pagination)
		$get = http_build_query($get);
		
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: application/json, text/javascript, */*, q=0.01",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"Content-Type: application/x-www-form-urlencoded",
			"Content-Length: " . strlen($get),
			"Referer: https://ca.pinterest.com/",
			"X-Requested-With: XMLHttpRequest",
			"X-APP-VERSION: 78f8764",
			"X-CSRFToken: " . $cookies["csrf"],
			"X-Pinterest-AppState: active",
			"X-Pinterest-Source-Url: /search/pins/?rs=ac&len=2&q=" . urlencode($header_data_post) . "&eq=" . urlencode($header_data_post),
			"X-Pinterest-PWS-Handler: www/search/[scope].js",
			"screen-dpr: 1",
			"is-preload-enabled: 1",
			"Origin: https://ca.pinterest.com",
			"DNT: 1",
			"Sec-GPC: 1",
			"Sec-Fetch-Dest: empty",
			"Sec-Fetch-Mode: cors",
			"Sec-Fetch-Site: same-origin",
			"Connection: keep-alive",
			"Alt-Used: ca.pinterest.com",
			"Cookie: " . $cookies["cookie"],
			"TE: trailers"]
		);
		
		curl_setopt($curlproc, CURLOPT_POST, true);
		curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);
	}
	
	curl_setopt($curlproc, CURLOPT_URL, $url);
	
	curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
	
	// http2 bypass
	curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
	
	curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
	curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
	curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
	
	$this->backend->assign_proxy($curlproc, $proxy);
	
	$data = curl_exec($curlproc);
	
	if(curl_errno($curlproc)){
		
		// release the handle before bailing out
		$error = curl_error($curlproc);
		curl_close($curlproc);
		
		throw new Exception($error);
	}
	
	// the transfer is done, the handle is not needed past this point
	curl_close($curlproc);
	
	if($header_data_post === null){
		
		if(!isset($cookies_tmp["csrftoken"])){
			
			throw new Exception("Failed to grep CSRF token");
		}
		
		// flatten the collected cookies into a "name=value; ..." string
		$cookie_string = "";
		
		foreach($cookies_tmp as $cookie_name => $cookie_value){
			
			$cookie_string .= $cookie_name . "=" . $cookie_value . "; ";
		}
		
		$cookies = [
			"csrf" => $cookies_tmp["csrftoken"],
			"cookie" => rtrim($cookie_string, " ;")
		];
	}
	
	return $data;
}
public function image($get){
if($get["npt"]){
[$data, $proxy] =
$this->backend->get(
$get["npt"], "images"
);
$data = json_decode($data, true);
$search = $data["q"];
$cookies = $data["cookies"];
try{
$json =
$this->get(
$proxy,
"https://ca.pinterest.com/resource/BaseSearchResource/get/",
[
"source_url" => "/search/pins/?q=" . urlencode($search) . "&rs=typed",
"data" => json_encode(
[
"options" => [
"applied_unified_filters" => null,
"appliedProductFilters" => "---",
"article" => null,
"auto_correction_disabled" => false,
"corpus" => null,
"customized_rerank_type" => null,
"domains" => null,
"dynamicPageSizeExpGroup" => null,
"filters" => null,
"journey_depth" => null,
"page_size" => null,
"price_max" => null,
"price_min" => null,
"query_pin_sigs" => null,
"query" => $data["q"],
"redux_normalize_feed" => true,
"request_params" => null,
"rs" => "typed",
"scope" => "pins",
"selected_one_bar_modules" => null,
"source_id" => null,
"source_module_id" => null,
"source_url" => "/search/pins/?q=" . urlencode($search) . "&rs=typed",
"top_pin_id" => null,
"top_pin_ids" => null,
"bookmarks" => [
$data["bookmark"]
]
],
"context" => []
],
JSON_UNESCAPED_SLASHES
)
],
$cookies,
$search
);
}catch(Exception $error){
throw new Exception("Failed to fetch JSON");
}
}else{
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
// https://ca.pinterest.com/resource/BaseSearchResource/get/?source_url=%2Fsearch%2Fpins%2F%3Feq%3Dhigurashi%26etslf%3D5966%26len%3D2%26q%3Dhigurashi%2520when%2520they%2520cry%26rs%3Dac&data=%7B%22options%22%3A%7B%22applied_unified_filters%22%3Anull%2C%22appliedProductFilters%22%3A%22---%22%2C%22article%22%3Anull%2C%22auto_correction_disabled%22%3Afalse%2C%22corpus%22%3Anull%2C%22customized_rerank_type%22%3Anull%2C%22domains%22%3Anull%2C%22dynamicPageSizeExpGroup%22%3Anull%2C%22filters%22%3Anull%2C%22journey_depth%22%3Anull%2C%22page_size%22%3Anull%2C%22price_max%22%3Anull%2C%22price_min%22%3Anull%2C%22query_pin_sigs%22%3Anull%2C%22query%22%3A%22higurashi%20when%20they%20cry%22%2C%22redux_normalize_feed%22%3Atrue%2C%22request_params%22%3Anull%2C%22rs%22%3A%22ac%22%2C%22scope%22%3A%22pins%22%2C%22selected_one_bar_modules%22%3Anull%2C%22source_id%22%3Anull%2C%22source_module_id%22%3Anull%2C%22source_url%22%3A%22%2Fsearch%2Fpins%2F%3Feq%3Dhigurashi%26etslf%3D5966%26len%3D2%26q%3Dhigurashi%2520when%2520they%2520cry%26rs%3Dac%22%2C%22top_pin_id%22%3Anull%2C%22top_pin_ids%22%3Anull%7D%2C%22context%22%3A%7B%7D%7D&_=1736116313987
// source_url=%2Fsearch%2Fpins%2F%3Feq%3Dhigurashi%26etslf%3D5966%26len%3D2%26q%3Dhigurashi%2520when%2520they%2520cry%26rs%3Dac
// &data=%7B%22options%22%3A%7B%22applied_unified_filters%22%3Anull%2C%22appliedProductFilters%22%3A%22---%22%2C%22article%22%3Anull%2C%22auto_correction_disabled%22%3Afalse%2C%22corpus%22%3Anull%2C%22customized_rerank_type%22%3Anull%2C%22domains%22%3Anull%2C%22dynamicPageSizeExpGroup%22%3Anull%2C%22filters%22%3Anull%2C%22journey_depth%22%3Anull%2C%22page_size%22%3Anull%2C%22price_max%22%3Anull%2C%22price_min%22%3Anull%2C%22query_pin_sigs%22%3Anull%2C%22query%22%3A%22higurashi%20when%20they%20cry%22%2C%22redux_normalize_feed%22%3Atrue%2C%22request_params%22%3Anull%2C%22rs%22%3A%22ac%22%2C%22scope%22%3A%22pins%22%2C%22selected_one_bar_modules%22%3Anull%2C%22source_id%22%3Anull%2C%22source_module_id%22%3Anull%2C%22source_url%22%3A%22%2Fsearch%2Fpins%2F%3Feq%3Dhigurashi%26etslf%3D5966%26len%3D2%26q%3Dhigurashi%2520when%2520they%2520cry%26rs%3Dac%22%2C%22top_pin_id%22%3Anull%2C%22top_pin_ids%22%3Anull%7D%2C%22context%22%3A%7B%7D%7D
// &_=1736116313987
$source_url = "/search/pins/?q=" . urlencode($search) . "&rs=" . urlencode($search);
$filter = [
"source_url" => $source_url,
"rs" => "typed",
"data" =>
json_encode(
[
"options" => [
"applied_unified_filters" => null,
"appliedProductFilters" => "---",
"article" => null,
"corpus" => null,
"customized_rerank_type" => null,
"domains" => null,
"dynamicPageSizeExpGroup" => null,
"filters" => null,
"journey_depth" => null,
"page_size" => null,
"price_max" => null,
"price_min" => null,
"query_pin_sigs" => null,
"query" => $search,
"redux_normalize_feed" => true,
"request_params" => null,
"rs" => "ac",
"scope" => "pins", // pins, boards, videos,
"selected_one_bar_modules" => null,
"source_id" => null,
"source_module_id" => null,
"source_url" => $source_url,
"top_pin_id" => null,
"top_pin_ids" => null
],
"context" => []
]
),
"_" => substr(str_replace(".", "", (string)microtime(true)), 0, -1)
];
$proxy = $this->backend->get_ip();
$cookies = [];
try{
$json =
$this->get(
$proxy,
"https://ca.pinterest.com/resource/BaseSearchResource/get/",
$filter,
$cookies,
null
);
}catch(Exception $error){
throw new Exception("Failed to fetch JSON");
}
}
$json = json_decode($json, true);
if($json === null){
throw new Exception("Failed to decode JSON");
}
$out = [
"status" => "ok",
"npt" => null,
"image" => []
];
if(
!isset(
$json["resource_response"]
["status"]
)
){
throw new Exception("Unknown API failure");
}
if($json["resource_response"]["status"] != "success"){
$status = "Got non-OK response: " . $json["resource_response"]["status"];
if(
isset(
$json["resource_response"]["message"]
)
){
$status .= " - " . $json["resource_response"]["message"];
}
throw new Exception($status);
}
if(
isset(
$json["resource_response"]["sensitivity"]
["notices"][0]["description"]["text"]
)
){
throw new Exception(
"Pinterest returned a notice: " .
$json["resource_response"]["sensitivity"]["notices"][0]["description"]["text"]
);
}
// get NPT
if(isset($json["resource_response"]["bookmark"])){
$out["npt"] =
$this->backend->store(
json_encode([
"q" => $search,
"bookmark" => $json["resource_response"]["bookmark"],
"cookies" => $cookies
]),
"images",
$proxy
);
}
foreach(
$json
["resource_response"]
["data"]
["results"]
as $item
){
switch($item["type"]){
case "pin":
case "board":
/*
Handle image object
*/
$images = array_values($item["images"]);
$image = &$images[count($images) - 1]; // original
$thumb = &$images[1]; // 236x
$title = [];
if(
isset($item["grid_title"]) &&
trim($item["grid_title"]) != ""
){
$title[] = $item["grid_title"];
}
if(
isset($item["description"]) &&
trim($item["description"]) != ""
){
$title[] = $item["description"];
}
$title = implode(": ", $title);
if(
$title == "" &&
isset($item["board"]["name"]) &&
trim($item["board"]["name"]) != ""
){
$title = $item["board"]["name"];
}
if($title == ""){
$title = null;
}
$out["image"][] = [
"title" => $title,
"source" => [
[
"url" => $image["url"],
"width" => (int)$image["width"],
"height" => (int)$image["height"]
],
[
"url" => $thumb["url"],
"width" => (int)$thumb["width"],
"height" => (int)$thumb["height"]
]
],
"url" =>
$item["link"] === null ?
"https://ca.pinterest.com/pin/" . $item["id"] :
$item["link"]
];
break;
}
}
return $out;
}
}

937
scraper/qwant.php Normal file
View file

@ -0,0 +1,937 @@
<?php
class qwant{
/**
 * Load the shared backend helper and create the instance used by this
 * scraper, registered under the name "qwant".
 */
public function __construct(){
    // include_once instead of include: other scraper classes load this same
    // file, and including a class definition twice in one request is fatal
    include_once "lib/backend.php";
    $this->backend = new backend("qwant");
}
/**
 * Describe the search filters offered for a given result page.
 *
 * @param string $page "web", "images", "videos" or "news". Any other value
 *                     yields only the filters shared by every page.
 * @return array Filter name => ["display" => label, "option" => [value => label]].
 *               A filter without a "display" entry is not shown in the interface.
 */
public function getfilters($page){
    // filters common to every page
    $base = [
        "nsfw" => [
            "display" => "NSFW",
            "option" => [
                "yes" => "Yes",
                "maybe" => "Maybe",
                "no" => "No"
            ]
        ],
        "country" => [
            "display" => "Country",
            "option" => [
                "en_US" => "United States",
                "fr_FR" => "France",
                "en_GB" => "Great Britain",
                "de_DE" => "Germany",
                "it_IT" => "Italy",
                "es_AR" => "Argentina",
                "en_AU" => "Australia",
                "es_ES" => "Spain (es)",
                "ca_ES" => "Spain (ca)",
                "cs_CZ" => "Czech Republic",
                "ro_RO" => "Romania",
                "el_GR" => "Greece",
                "zh_CN" => "China",
                "zh_HK" => "Hong Kong",
                "en_NZ" => "New Zealand",
                // "fr_FR" used to be listed a second time here; the duplicate
                // key was a no-op and has been dropped
                "th_TH" => "Thailand",
                "ko_KR" => "South Korea",
                "sv_SE" => "Sweden",
                "nb_NO" => "Norway",
                "da_DK" => "Denmark",
                "hu_HU" => "Hungary",
                "et_EE" => "Estonia",
                "es_MX" => "Mexico",
                "es_CL" => "Chile",
                "en_CA" => "Canada (en)",
                "fr_CA" => "Canada (fr)",
                "en_MY" => "Malaysia",
                "bg_BG" => "Bulgaria",
                "fi_FI" => "Finland",
                "pl_PL" => "Poland",
                "nl_NL" => "Netherlands",
                "pt_PT" => "Portugal",
                "de_CH" => "Switzerland (de)",
                "fr_CH" => "Switzerland (fr)",
                "it_CH" => "Switzerland (it)",
                "de_AT" => "Austria",
                "fr_BE" => "Belgium (fr)",
                "nl_BE" => "Belgium (nl)",
                "en_IE" => "Ireland",
                "he_IL" => "Israel"
            ]
        ]
    ];

    // "Time posted" filter shared by the web and images pages
    $time = [
        "display" => "Time posted",
        "option" => [
            "any" => "Any time",
            "day" => "Past 24 hours",
            "week" => "Past week",
            "month" => "Past month"
        ]
    ];

    switch($page){

        case "web":
            $extra = [
                "time" => $time,
                "extendedsearch" => [
                    // no display, wont show in interface
                    "option" => [
                        "yes" => "Yes",
                        "no" => "No"
                    ]
                ]
            ];
            break;

        case "images":
            $extra = [
                "time" => $time,
                "size" => [
                    "display" => "Size",
                    "option" => [
                        "any" => "Any size",
                        "large" => "Large",
                        "medium" => "Medium",
                        "small" => "Small"
                    ]
                ],
                "color" => [
                    "display" => "Color",
                    "option" => [
                        "any" => "Any color",
                        "coloronly" => "Color only",
                        "monochrome" => "Monochrome",
                        "black" => "Black",
                        "brown" => "Brown",
                        "gray" => "Gray",
                        "white" => "White",
                        "yellow" => "Yellow",
                        "orange" => "Orange",
                        "red" => "Red",
                        "pink" => "Pink",
                        "purple" => "Purple",
                        "blue" => "Blue",
                        "teal" => "Teal",
                        "green" => "Green"
                    ]
                ],
                "imagetype" => [
                    "display" => "Type",
                    "option" => [
                        "any" => "Any type",
                        "animatedgif" => "Animated GIF",
                        "photo" => "Photograph",
                        "transparent" => "Transparent"
                    ]
                ],
                "license" => [
                    "display" => "License",
                    "option" => [
                        "any" => "Any license",
                        "share" => "Non-commercial reproduction and sharing",
                        "sharecommercially" => "Reproduction and sharing",
                        "modify" => "Non-commercial reproduction, sharing and modification",
                        "modifycommercially" => "Reproduction, sharing and modification",
                        "public" => "Public domain"
                    ]
                ]
            ];
            break;

        case "videos":
            $extra = [
                "order" => [
                    "display" => "Order by",
                    "option" => [
                        "relevance" => "Relevance",
                        "views" => "Views",
                        "date" => "Most recent"
                    ]
                ],
                "source" => [
                    "display" => "Source",
                    "option" => [
                        "any" => "Any source",
                        "youtube" => "YouTube",
                        "dailymotion" => "Dailymotion"
                    ]
                ]
            ];
            break;

        case "news":
            $extra = [
                // news additionally offers a one-hour window
                "time" => [
                    "display" => "Time posted",
                    "option" => [
                        "any" => "Any time",
                        "hour" => "Less than 1 hour ago",
                        "day" => "Past 24 hours",
                        "week" => "Past week",
                        "month" => "Past month"
                    ]
                ],
                "order" => [
                    "display" => "Order by",
                    "option" => [
                        "relevance" => "Relevance",
                        "date" => "Most recent"
                    ]
                ]
            ];
            break;

        default:
            $extra = [];
            break;
    }

    return array_merge($base, $extra);
}
/**
 * Send a GET request with browser-like headers and return the response body.
 *
 * @param mixed  $proxy Proxy descriptor passed on to backend::assign_proxy().
 * @param string $url   Endpoint URL without a query string.
 * @param array  $get   Query parameters; appended to $url unless empty.
 * @return string|bool  The value returned by curl_exec() (the body, since
 *                      CURLOPT_RETURNTRANSFER is enabled).
 * @throws Exception    Carrying the cURL error text when the transfer fails.
 */
private function get($proxy, $url, $get = []){
    $headers = [
        "User-Agent: " . config::USER_AGENT,
        "Accept: application/json, text/plain, */*",
        "Accept-Language: en-US,en;q=0.5",
        "Accept-Encoding: gzip",
        "DNT: 1",
        "Connection: keep-alive",
        "Origin: https://www.qwant.com",
        "Referer: https://www.qwant.com/",
        "Sec-Fetch-Dest: empty",
        "Sec-Fetch-Mode: cors",
        "Sec-Fetch-Site: same-site",
        "TE: trailers"
    ];

    if($get !== []){
        $url .= "?" . http_build_query($get);
    }

    $curlproc = curl_init();

    try{
        curl_setopt($curlproc, CURLOPT_URL, $url);
        curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
        curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);

        // Bypass HTTP/2 check
        curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);

        curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
        curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
        curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

        $this->backend->assign_proxy($curlproc, $proxy);

        $data = curl_exec($curlproc);

        if(curl_errno($curlproc)){
            throw new Exception(curl_error($curlproc));
        }

        return $data;
    }finally{
        // previously the handle was only closed on success; close it on the
        // error path as well
        curl_close($curlproc);
    }
}
/**
 * Run a Qwant web search, or continue one from a stored next-page token.
 *
 * Reads from $get: "npt" (next-page token), "s" (search term), "time",
 * "country", "nsfw" and "extendedsearch".
 *
 * @param array $get Request parameters as described above.
 * @return array Result set with the keys "status", "spelling", "npt",
 *               "answer", "web", "image", "video", "news" and "related".
 * @throws Exception On an empty or over-long search term, on fetch or decode
 *                   failure, on an API error, or when the response lacks
 *                   the expected result structure.
 */
public function web($get){
if($get["npt"]){
// get next page data
// the stored payload is the JSON-encoded $params of the previous page
[$params, $proxy] = $this->backend->get($get["npt"], "web");
$params = json_decode($params, true);
}else{
// get _GET data instead
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
if(strlen($search) > 2048){
throw new Exception("Search term is too long!");
}
$proxy = $this->backend->get_ip();
$params = [
"q" => $search,
"freshness" => $get["time"],
"count" => 10,
"locale" => $get["country"],
"offset" => 0,
"device" => "desktop",
"tgp" => 3,
"safesearch" => 0,
"displayed" => "true"
];
// map the nsfw filter onto the safesearch levels 0, 1 and 2;
// any other value keeps the default of 0 set above
switch($get["nsfw"]){
case "yes": $params["safesearch"] = 0; break;
case "maybe": $params["safesearch"] = 1; break;
case "no": $params["safesearch"] = 2; break;
}
}
/*
$handle = fopen("scraper/qwant_web.json", "r");
$json = fread($handle, filesize("scraper/qwant_web.json"));
fclose($handle);*/
try{
$json =
$this->get(
$proxy,
"https://fdn.qwant.com/v3/search/web",
$params
);
}catch(Exception $error){
throw new Exception("Could not fetch JSON");
}
$json = json_decode($json, true);
if($json === NULL){
throw new Exception("Failed to decode JSON");
}
if(isset($json["data"]["message"][0])){
throw new Exception("Server returned an error:\n" . $json["data"]["message"][0]);
}
// empty result skeleton; returned as-is when there are no results
$out = [
"status" => "ok",
"spelling" => [
"type" => "no_correction",
"using" => null,
"correction" => null
],
"npt" => null,
"answer" => [],
"web" => [],
"image" => [],
"video" => [],
"news" => [],
"related" => []
];
// error code 5 is handled as "no results" rather than as a failure
if(
$json["status"] != "success" &&
$json["data"]["error_code"] === 5
){
// no results
return $out;
}
$this->detect_errors($json);
if(!isset($json["data"]["result"]["items"]["mainline"])){
throw new Exception("Server did not return a result object");
}
// data is OK, parse
// get instant answer
// only attempted when extended search is on and the sidebar names an endpoint
if(
$get["extendedsearch"] == "yes" &&
isset($json["data"]["result"]["items"]["sidebar"][0]["endpoint"])
){
try{
$answer =
$this->get(
$proxy,
"https://api.qwant.com/v3" .
$json["data"]["result"]["items"]["sidebar"][0]["endpoint"],
[]
);
$answer = json_decode($answer, true);
if(
$answer === null ||
$answer["status"] != "success" ||
$answer["data"]["result"] === null
){
// jump to the catch below, which drops the answer silently
throw new Exception();
}
// parse answer
$out["answer"][] = [
"title" => $answer["data"]["result"]["title"],
"description" => [
[
"type" => "text",
"value" => $this->trimdots($answer["data"]["result"]["description"])
]
],
"url" => $answer["data"]["result"]["url"],
"thumb" =>
$answer["data"]["result"]["thumbnail"]["landscape"] == null ?
null :
$this->unshitimage(
$answer["data"]["result"]["thumbnail"]["landscape"],
false
),
"table" => [],
"sublink" => []
];
}catch(Exception $error){
// do nothing in case of failure
}
}
// get word correction
if(isset($json["data"]["query"]["queryContext"]["alteredQuery"])){
$out["spelling"] = [
"type" => "including",
"using" => $json["data"]["query"]["queryContext"]["alteredQuery"],
"correction" => $json["data"]["query"]["queryContext"]["alterationOverrideQuery"]
];
}
// check for next page
if($json["data"]["result"]["lastPage"] === false){
// advance by the page size (count = 10) and store the parameters as the token
$params["offset"] = $params["offset"] + 10;
$out["npt"] =
$this->backend->store(
json_encode($params),
"web",
$proxy
);
}
// parse results
foreach($json["data"]["result"]["items"]["mainline"] as $item){
switch($item["type"]){ // ignores ads
case "web":
$first_iteration = true;
foreach($item["items"] as $result){
if(isset($result["thumbnailUrl"])){
$thumb = [
"url" => $this->unshitimage($result["thumbnailUrl"]),
"ratio" => "16:9"
];
}else{
$thumb = [
"url" => null,
"ratio" => null
];
}
$sublinks = [];
if(isset($result["links"])){
foreach($result["links"] as $link){
$sublinks[] = [
"title" => $this->trimdots($link["title"]),
"date" => null,
"description" => isset($link["desc"]) ? $this->trimdots($link["desc"]) : null,
"url" => $link["url"]
];
}
}
// detect gibberish results
// only the first result of each web group is checked for urlPingSuffix
if(
$first_iteration &&
!isset($result["urlPingSuffix"])
){
throw new Exception("Qwant returned gibberish results");
}
$out["web"][] = [
"title" => $this->trimdots($result["title"]),
"description" => $this->trimdots($result["desc"]),
"url" => $result["url"],
"date" => null,
"type" => "web",
"thumb" => $thumb,
"sublink" => $sublinks,
"table" => []
];
$first_iteration = false;
}
break;
case "images":
foreach($item["items"] as $image){
$out["image"][] = [
"title" => $image["title"],
"source" => [
// full-size image first, thumbnail second
[
"url" => $image["media"],
"width" => (int)$image["width"],
"height" => (int)$image["height"]
],
[
"url" => $this->unshitimage($image["thumbnail"]),
"width" => $image["thumb_width"],
"height" => $image["thumb_height"]
]
],
"url" => $image["url"]
];
}
break;
case "videos":
foreach($item["items"] as $video){
$out["video"][] = [
"title" => $video["title"],
"description" => null,
"date" => (int)$video["date"],
// presumably milliseconds converted to seconds -- TODO confirm against the API
"duration" => $video["duration"] === null ? null : $video["duration"] / 1000,
"views" => null,
"thumb" =>
$video["thumbnail"] === null ?
[
"url" => null,
"ratio" => null,
] :
[
"url" => $this->unshitimage($video["thumbnail"]),
"ratio" => "16:9",
],
"url" => $video["url"]
];
}
break;
case "related_searches":
foreach($item["items"] as $related){
$out["related"][] = $related["text"];
}
break;
}
}
return $out;
}
/**
 * Run a Qwant image search, or continue one from a stored next-page token.
 *
 * Reads from $get: "npt", "s", "country", "time", "size", "color",
 * "imagetype", "license" and "nsfw".
 *
 * @param array $get Request parameters as described above.
 * @return array ["status" => "ok", "npt" => token or null, "image" => list of images].
 * @throws Exception On an empty search term, on fetch or decode failure, on
 *                   an API error, or when a web-style response is returned.
 */
public function image($get){
    if(!$get["npt"]){
        // fresh search: build the request from the submitted filters
        $query = $get["s"];

        if(strlen($query) === 0){
            throw new Exception("Search term is empty!");
        }

        $proxy = $this->backend->get_ip();

        $params = [
            "t" => "images",
            "q" => $query,
            "count" => 125,
            "locale" => $get["country"],
            "offset" => 0, // increment by 125
            "device" => "desktop",
            "tgp" => 3
        ];

        if($get["time"] != "any"){
            $params["freshness"] = $get["time"];
        }

        // each of these filters is only sent when it narrows the search
        foreach(["size", "color", "imagetype", "license"] as $filter){
            if($get[$filter] != "any"){
                $params[$filter] = $get[$filter];
            }
        }

        $nsfw = $get["nsfw"];

        if($nsfw == "yes"){
            $params["safesearch"] = 0;
        }elseif($nsfw == "maybe"){
            $params["safesearch"] = 1;
        }elseif($nsfw == "no"){
            $params["safesearch"] = 2;
        }
    }else{
        // continuation: the token holds the JSON-encoded request parameters
        [$stored, $proxy] =
            $this->backend->get(
                $get["npt"],
                "images"
            );

        $params = json_decode($stored, true);
    }

    try{
        $response =
            $this->get(
                $proxy,
                "https://api.qwant.com/v3/search/images",
                $params
            );
    }catch(Exception $err){
        throw new Exception("Failed to get JSON");
    }

    $json = json_decode($response, true);

    if($json === null){
        throw new Exception("Failed to decode JSON");
    }

    $this->detect_errors($json);

    // a "mainline" key is treated as a bad (web-shaped) response here
    if(isset($json["data"]["result"]["items"]["mainline"])){
        throw new Exception("Qwant returned gibberish results");
    }

    $out = [
        "status" => "ok",
        "npt" => null,
        "image" => []
    ];

    $result = $json["data"]["result"];

    if($result["lastPage"] === false){
        $params["offset"] = $params["offset"] + 125;

        $out["npt"] =
            $this->backend->store(
                json_encode($params),
                "images",
                $proxy
            );
    }

    foreach($result["items"] as $entry){
        $full = [
            "url" => $entry["media"],
            "width" => $entry["width"],
            "height" => $entry["height"]
        ];

        $thumbnail = [
            "url" => $this->unshitimage($entry["thumbnail"]),
            "width" => $entry["thumb_width"],
            "height" => $entry["thumb_height"]
        ];

        $out["image"][] = [
            "title" => $this->trimdots($entry["title"]),
            "source" => [$full, $thumbnail],
            "url" => $entry["url"]
        ];
    }

    return $out;
}
/**
 * Run a Qwant video search (first 50 results, no pagination).
 *
 * Reads from $get: "s", "country" and "nsfw".
 *
 * @param array $get Request parameters as described above.
 * @return array Result set with the keys "status", "npt", "video", "author",
 *               "livestream", "playlist" and "reel"; only "video" is filled.
 * @throws Exception On an empty search term, on fetch or decode failure, on
 *                   an API error, or when a web-style response is returned.
 */
public function video($get){
    $query = $get["s"];

    if(strlen($query) === 0){
        throw new Exception("Search term is empty!");
    }

    $params = [
        "t" => "videos",
        "q" => $query,
        "count" => 50,
        "locale" => $get["country"],
        "offset" => 0, // dont implement pagination
        "device" => "desktop",
        "tgp" => 3
    ];

    $nsfw = $get["nsfw"];

    if($nsfw == "yes"){
        $params["safesearch"] = 0;
    }elseif($nsfw == "maybe"){
        $params["safesearch"] = 1;
    }elseif($nsfw == "no"){
        $params["safesearch"] = 2;
    }

    try{
        // proxy selection stays inside the try so its failure is reported
        // the same way as a failed request
        $proxy = $this->backend->get_ip();

        $response =
            $this->get(
                $proxy,
                "https://api.qwant.com/v3/search/videos",
                $params
            );
    }catch(Exception $error){
        throw new Exception("Could not fetch JSON");
    }

    $json = json_decode($response, true);

    if($json === null){
        throw new Exception("Could not parse JSON");
    }

    $this->detect_errors($json);

    if(isset($json["data"]["result"]["items"]["mainline"])){
        throw new Exception("Qwant returned gibberish results");
    }

    $out = [
        "status" => "ok",
        "npt" => null,
        "video" => [],
        "author" => [],
        "livestream" => [],
        "playlist" => [],
        "reel" => []
    ];

    foreach($json["data"]["result"]["items"] as $entry){
        $thumb =
            empty($entry["thumbnail"]) ?
            [
                "url" => null,
                "ratio" => null
            ] :
            [
                "url" => $this->unshitimage($entry["thumbnail"], false),
                "ratio" => "16:9"
            ];

        // a duration of 0 is reported as unknown
        $seconds = (int)$entry["duration"];

        if($seconds === 0){
            $seconds = null;
        }

        $out["video"][] = [
            "title" => $entry["title"],
            "description" => $this->limitstrlen($entry["desc"]),
            "author" => [
                "name" => $entry["channel"],
                "url" => null,
                "avatar" => null
            ],
            "date" => (int)$entry["date"],
            "duration" => $seconds,
            "views" => null,
            "thumb" => $thumb,
            // strip the syndication query string from the video link
            "url" => preg_replace("/\?syndication=.+/", "", $entry["url"])
        ];
    }

    return $out;
}
/**
 * Run a Qwant news search (first 50 results, no pagination).
 *
 * Reads from $get: "s", "country" and "nsfw".
 *
 * @param array $get Request parameters as described above.
 * @return array ["status" => "ok", "npt" => null, "news" => list of articles].
 * @throws Exception On an empty search term, on fetch or decode failure, on
 *                   an API error, or when a web-style response is returned.
 */
public function news($get){
    $query = $get["s"];

    if(strlen($query) === 0){
        throw new Exception("Search term is empty!");
    }

    $params = [
        "t" => "news",
        "q" => $query,
        "count" => 50,
        "locale" => $get["country"],
        "offset" => 0, // dont implement pagination
        "device" => "desktop",
        "tgp" => 3
    ];

    $nsfw = $get["nsfw"];

    if($nsfw == "yes"){
        $params["safesearch"] = 0;
    }elseif($nsfw == "maybe"){
        $params["safesearch"] = 1;
    }elseif($nsfw == "no"){
        $params["safesearch"] = 2;
    }

    try{
        // proxy selection stays inside the try so its failure is reported
        // the same way as a failed request
        $proxy = $this->backend->get_ip();

        $response =
            $this->get(
                $proxy,
                "https://api.qwant.com/v3/search/news",
                $params
            );
    }catch(Exception $error){
        throw new Exception("Could not fetch JSON");
    }

    $json = json_decode($response, true);

    if($json === null){
        throw new Exception("Could not parse JSON");
    }

    $this->detect_errors($json);

    if(isset($json["data"]["result"]["items"]["mainline"])){
        throw new Exception("Qwant returned gibberish results");
    }

    $out = [
        "status" => "ok",
        "npt" => null,
        "news" => []
    ];

    foreach($json["data"]["result"]["items"] as $article){
        // use the first media entry's large picture when there is one
        $thumb =
            empty($article["media"][0]["pict_big"]["url"]) ?
            [
                "url" => null,
                "ratio" => null
            ] :
            [
                "url" => $this->unshitimage($article["media"][0]["pict_big"]["url"], false),
                "ratio" => "16:9"
            ];

        $out["news"][] = [
            "title" => $article["title"],
            "author" => $article["press_name"],
            "description" => $this->trimdots($article["desc"]),
            "date" => (int)$article["date"],
            "thumb" => $thumb,
            "url" => $article["url"]
        ];
    }

    return $out;
}
/**
 * Throw when a decoded Qwant response reports an error status.
 *
 * @param array $json Decoded API response.
 * @throws Exception For a captcha, for an error carrying an error code, or
 *                   for any other response whose status is "error".
 */
private function detect_errors($json){
    // anything that is not explicitly an error passes through
    if(
        !isset($json["status"]) ||
        $json["status"] != "error"
    ){
        return;
    }

    if(isset($json["data"]["error_data"]["captchaUrl"])){
        throw new Exception("Qwant returned a captcha");
    }

    if(isset($json["data"]["error_data"]["error_code"])){
        $code = $json["data"]["error_data"]["error_code"];

        throw new Exception("Qwant returned an API error: " . $code);
    }

    throw new Exception("Qwant returned an API error");
}
/**
 * Cut a text down to its first wrapped line of at most about 300 characters.
 *
 * The text is wrapped at word boundaries and only the first line is kept, so
 * a newline already present in the input also ends the result.
 *
 * @param string|null $text Text to shorten; null is treated as "".
 * @return string The first line of the wrapped text.
 */
private function limitstrlen($text){
    // null descriptions used to be passed straight to wordwrap(), which is
    // deprecated since PHP 8.1
    $wrapped = wordwrap($text ?? "", 300, "\n");

    // only the first line is needed, so stop splitting after it
    return explode("\n", $wrapped, 2)[0];
}
/**
 * Strip leading and trailing dots and spaces from a text.
 *
 * @param string|null $text Text to clean; null is treated as "".
 * @return string The trimmed text.
 */
private function trimdots($text){
    // missing fields arrive as null; passing null to trim() is deprecated
    // since PHP 8.1
    return trim($text ?? "", ". ");
}
/**
 * Recover the original image URL from a Qwant thumbnail proxy URL.
 *
 * Qwant thumbnail URLs carry the real image address in their "u" query
 * parameter, for example:
 * https://s1.qwant.com/thumbr/0x0/8/d/f6de4deb2c2b12f55d8bdcaae576f9f62fd58a05ec0feeac117b354d1bf5c2/th.jpg?u=https%3A%2F%2Fwww.bing.com%2Fth%3Fid%3DOIP.vvDWsagzxjoKKP_rOqhwrQAAAA%26w%3D160%26h%3D160%26c%3D7%26pid%3D5.1&q=0&b=1&p=0&a=0
 *
 * @param string $url     Thumbnail proxy URL.
 * @param bool   $is_bing When true, the embedded URL is rebuilt as
 *                        "https://<host>/th?id=<id>", dropping every other
 *                        parameter; when false it is returned unchanged.
 * @return string The unwrapped URL. If $url has no "u" parameter it is
 *                returned as given; if the embedded URL lacks a host or an
 *                "id" parameter, the embedded URL itself is returned.
 */
private function unshitimage($url, $is_bing = true){
    $query = parse_url((string)$url, PHP_URL_QUERY);

    if(!is_string($query)){
        // nothing to unwrap
        return $url;
    }

    parse_str($query, $parts);

    if(
        !isset($parts["u"]) ||
        !is_string($parts["u"])
    ){
        return $url;
    }

    if(!$is_bing){
        return $parts["u"];
    }

    $target = parse_url($parts["u"]);

    if(
        !is_array($target) ||
        !isset($target["host"], $target["query"])
    ){
        return $parts["u"];
    }

    parse_str($target["query"], $target_params);

    if(
        !isset($target_params["id"]) ||
        !is_string($target_params["id"])
    ){
        return $parts["u"];
    }

    return "https://" . $target["host"] . "/th?id=" . urlencode($target_params["id"]);
}
}

512
scraper/sc.php Normal file
View file

@ -0,0 +1,512 @@
<?php
class sc{
/**
 * Load the shared helpers and create the instances used by this scraper:
 * a backend registered under the name "sc" and an HTML parser.
 */
public function __construct(){
    // include_once instead of include: other scraper classes load these same
    // files, and including a class definition twice in one request is fatal
    include_once "lib/backend.php";
    $this->backend = new backend("sc");

    include_once "lib/fuckhtml.php";
    $this->fuckhtml = new fuckhtml();
}
/**
 * Describe the search filters offered by this scraper.
 *
 * @param string $page Requested page; the same filters are returned for any value.
 * @return array Filter name => ["display" => label, "option" => [value => label]].
 */
public function getfilters($page){
    // result kinds the search can be restricted to
    $types = [
        "any" => "Any type",
        "track" => "Tracks",
        "author" => "People",
        "album" => "Albums",
        "playlist" => "Playlists",
        "goplus" => "Go+ Tracks"
    ];

    $filters = [];
    $filters["type"] = [
        "display" => "Type",
        "option" => $types
    ];

    return $filters;
}
/**
 * Send a GET request over HTTP/2 and return the response body.
 *
 * @param mixed  $proxy   Proxy descriptor passed on to backend::assign_proxy().
 * @param string $url     URL without a query string.
 * @param array  $get     Query parameters; appended to $url unless empty.
 * @param bool   $web_req false (default) sends cross-origin API-style headers
 *                        with a soundcloud.com Referer and Origin; any other
 *                        value sends page-navigation headers.
 * @return string|bool    The value returned by curl_exec() (the body, since
 *                        CURLOPT_RETURNTRANSFER is enabled).
 * @throws Exception      Carrying the cURL error text when the transfer fails.
 */
private function get($proxy, $url, $get = [], $web_req = false){
    if($get !== []){
        $url .= "?" . http_build_query($get);
    }

    if($web_req === false){
        $headers = [
            "User-Agent: " . config::USER_AGENT,
            "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language: en-US,en;q=0.5",
            "Accept-Encoding: gzip",
            "Referer: https://soundcloud.com/",
            "Origin: https://soundcloud.com",
            "DNT: 1",
            "Connection: keep-alive",
            "Sec-Fetch-Dest: empty",
            "Sec-Fetch-Mode: cors",
            "Sec-Fetch-Site: same-site",
            "Priority: u=1"
        ];
    }else{
        $headers = [
            "User-Agent: " . config::USER_AGENT,
            "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language: en-US,en;q=0.5",
            "Accept-Encoding: gzip",
            "DNT: 1",
            "Connection: keep-alive",
            "Upgrade-Insecure-Requests: 1",
            "Sec-Fetch-Dest: document",
            "Sec-Fetch-Mode: navigate",
            "Sec-Fetch-Site: cross-site",
            "Priority: u=1",
            "TE: trailers"
        ];
    }

    $curlproc = curl_init();

    try{
        curl_setopt($curlproc, CURLOPT_URL, $url);
        curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding

        // use http2
        curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);

        curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);

        curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
        curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
        curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

        $this->backend->assign_proxy($curlproc, $proxy);

        $data = curl_exec($curlproc);

        if(curl_errno($curlproc)){
            throw new Exception(curl_error($curlproc));
        }

        return $data;
    }finally{
        // previously the handle was only closed on success; close it on the
        // error path as well
        curl_close($curlproc);
    }
}
/**
 * Run a SoundCloud search, or continue one from a stored next-page token.
 *
 * Reads from $get: "npt" (next-page token), "s" (search term) and "type"
 * ("any", "track", "author", "album", "playlist" or "goplus").
 *
 * Example request (general search):
 * https://api-v2.soundcloud.com/search?q=freddie%20dredd&variant_ids=&facet=model&user_id=351062-302234-707916-795081&client_id=iMxZgT5mfGstBj8GWJbYMvpzelS8ne0E&limit=20&offset=0&linked_partitioning=1&app_version=1693487844&app_locale=en
 * The other search types use /search/tracks, /search/users, /search/albums
 * and /search/playlists_without_albums; Go+ search is /search/tracks with
 * filter.content_tier=SUB_HIGH_TIER.
 *
 * @param array $get          Request parameters as described above.
 * @param bool  $last_attempt When true, an undecodable response is reported
 *                            immediately instead of being retried once with
 *                            a newly scraped client id.
 * @return array Result set with the keys "status", "npt", "song",
 *               "playlist", "album", "podcast", "author" and "user".
 * @throws Exception On an empty search term, an unknown type, a failed
 *                   request, or a response that is not valid JSON.
 */
public function music($get, $last_attempt = false){
    if($get["npt"]){
        [$params, $proxy] = $this->backend->get($get["npt"], "music");
        $params = json_decode($params, true);

        $url = $params["url"];
        unset($params["url"]);

        // the stored parameters carry the client id of the first page; it
        // was previously left undefined here, which broke the stream URLs
        // built for tracks on every page after the first
        $token = $params["client_id"];
    }else{
        $search = $get["s"];

        if(strlen($search) === 0){
            throw new Exception("Search term is empty!");
        }

        $type = $get["type"];

        // endpoint and type-specific parameters per search type
        $endpoints = [
            "any" => [
                "https://api-v2.soundcloud.com/search",
                ["facet" => "model"]
            ],
            "track" => [
                "https://api-v2.soundcloud.com/search/tracks",
                // NOTE(review): the other types send "facet" => "genre";
                // kept as "facet_genre" => "" as before -- confirm whether
                // that is intentional
                ["facet_genre" => ""]
            ],
            "author" => [
                "https://api-v2.soundcloud.com/search/users",
                ["facet" => "place"]
            ],
            "album" => [
                "https://api-v2.soundcloud.com/search/albums",
                ["facet" => "genre"]
            ],
            "playlist" => [
                "https://api-v2.soundcloud.com/search/playlists_without_albums",
                ["facet" => "genre"]
            ],
            "goplus" => [
                "https://api-v2.soundcloud.com/search/tracks",
                [
                    "filter.content_tier" => "SUB_HIGH_TIER",
                    "facet" => "genre"
                ]
            ]
        ];

        if(
            !is_string($type) ||
            !isset($endpoints[$type])
        ){
            // previously fell through with $url and $params undefined
            throw new Exception("Unknown search type");
        }

        $proxy = $this->backend->get_ip();
        $token = $this->get_token($proxy);

        [$url, $type_params] = $endpoints[$type];

        // key order is kept as before: q, variant_ids, type-specific
        // parameters, then the shared tail
        $params = array_merge(
            [
                "q" => $search,
                "variant_ids" => ""
            ],
            $type_params,
            [
                "client_id" => $token,
                "limit" => 20,
                "offset" => 0,
                "linked_partitioning" => 1,
                "app_version" => 1713542117,
                "app_locale" => "en"
            ]
        );
    }

    /*
        Fetch the results. An undecodable response is taken to mean the
        client id has expired, so one retry is made with a new one.
    */
    $attempts = $last_attempt === true ? 1 : 2;
    $json = null;

    while($attempts-- > 0){
        try{
            $json = $this->get($proxy, $url, $params);
        }catch(Exception $error){
            throw new Exception("Failed to fetch JSON");
        }

        $json = json_decode($json, true);

        if($json !== null){
            break;
        }

        if($attempts > 0){
            // get_token() returns the cached value whenever one exists, so
            // the stale token has to be dropped before asking for a new one
            apcu_delete("sc_token");
            $token = $this->get_token($proxy);
            $params["client_id"] = $token;
        }
    }

    if($json === null){
        throw new Exception("Fetched an invalid token (please report!!)");
    }

    $out = [
        "status" => "ok",
        "npt" => null,
        "song" => [],
        "playlist" => [],
        "album" => [],
        "podcast" => [],
        "author" => [],
        "user" => []
    ];

    /*
        Get next page
    */
    if(isset($json["next_href"])){
        $params["query_urn"] = $json["query_urn"] ?? null;
        $params["offset"] = $params["offset"] + 20;
        $params["url"] = $url; // we will remove this later

        $out["npt"] =
            $this->backend->store(
                json_encode($params),
                "music",
                $proxy
            );
    }

    /*
        Scrape items
    */
    foreach($json["collection"] as $item){
        switch($item["kind"]){

            case "user":
                // parse author
                $out["author"][] = [
                    "title" => $item["username"],
                    "followers" => $item["followers_count"],
                    "description" => trim($item["track_count"] . " songs. " . $this->limitstrlen($item["description"] ?? "")),
                    "thumb" => [
                        "url" => $item["avatar_url"],
                        "ratio" => "1:1"
                    ],
                    "url" => $item["permalink_url"]
                ];
                break;

            case "playlist":
                // parse playlist: the description lists the track titles
                $description = [];
                $count = 0;

                foreach($item["tracks"] as $song){
                    $count++;

                    // tracks without a title are counted but not listed
                    if(!isset($song["title"])){
                        continue;
                    }

                    $description[] = $song["title"];
                }

                if(count($description) !== 0){
                    $description = trim($count . " songs. " . implode(", ", $description));
                }else{
                    $description = "";
                }

                // prefer the playlist artwork, then the first track's artwork
                if(!empty($item["artwork_url"])){
                    $thumb = [
                        "ratio" => "1:1",
                        "url" => $item["artwork_url"]
                    ];
                }elseif(!empty($item["tracks"][0]["artwork_url"])){
                    $thumb = [
                        "ratio" => "1:1",
                        "url" => $item["tracks"][0]["artwork_url"]
                    ];
                }else{
                    $thumb = [
                        "ratio" => null,
                        "url" => null
                    ];
                }

                $out["playlist"][] = [
                    "title" => $item["title"],
                    "description" => $this->limitstrlen($description),
                    "author" => [
                        "name" => $item["user"]["username"],
                        "url" => $item["user"]["permalink_url"],
                        "avatar" => $item["user"]["avatar_url"]
                    ],
                    "thumb" => $thumb,
                    "date" => strtotime($item["created_at"]),
                    "duration" => $item["duration"] / 1000,
                    "url" => $item["permalink_url"]
                ];
                break;

            case "track":
                // tracks whose monetization model mentions "TIER" get no
                // stream URL
                if(stripos($item["monetization_model"] ?? "", "TIER") === false){
                    $stream = [
                        "endpoint" => "sc",
                        "url" =>
                            $item["media"]["transcodings"][0]["url"] .
                            "?client_id=" . $token .
                            "&track_authorization=" .
                            $item["track_authorization"]
                    ];
                }else{
                    $stream = [
                        "endpoint" => null,
                        "url" => null
                    ];
                }

                // parse track
                $out["song"][] = [
                    "title" => $item["title"],
                    "description" => $item["description"] == "" ? null : $this->limitstrlen($item["description"]),
                    "url" => $item["permalink_url"],
                    "views" => $item["playback_count"],
                    "author" => [
                        "name" => $item["user"]["username"],
                        "url" => $item["user"]["permalink_url"],
                        "avatar" => $item["user"]["avatar_url"]
                    ],
                    "thumb" => [
                        "ratio" => "1:1",
                        "url" => $item["artwork_url"]
                    ],
                    "date" => strtotime($item["created_at"]),
                    "duration" => (int)$item["full_duration"] / 1000,
                    "stream" => $stream
                ];
                break;
        }
    }

    return $out;
}
/**
 * Return the SoundCloud client id used to sign API requests.
 *
 * The value cached in APCu under "sc_token" is returned when present.
 * Otherwise the SoundCloud front page is fetched and each script hosted on
 * sndcdn.com is downloaded in turn until one contains "client_id=..."; that
 * value is cached and returned.
 *
 * @param mixed $proxy Proxy descriptor used for the page and script requests.
 * @return string The client id.
 * @throws Exception When the front page or a script cannot be fetched, or
 *                   when no script contains a client id.
 */
public function get_token($proxy){
    $cached = apcu_fetch("sc_token");

    if($cached !== false){
        return $cached;
    }

    // search through all javascript components on the main page
    try{
        $html =
            $this->get(
                $proxy,
                "https://soundcloud.com",
                [],
                true
            );
    }catch(Exception $error){
        throw new Exception("Failed to fetch front page");
    }

    $this->fuckhtml->load($html);

    $scripts =
        $this->fuckhtml
        ->getElementsByTagName(
            "script"
        );

    foreach($scripts as $script){
        // only external scripts served from sndcdn.com are candidates
        if(
            !isset($script["attributes"]["src"]) ||
            strpos($script["attributes"]["src"], "sndcdn.com") === false
        ){
            continue;
        }

        try{
            $js =
                $this->get(
                    $proxy,
                    $script["attributes"]["src"],
                    []
                );
        }catch(Exception $error){
            throw new Exception("Failed to fetch search token");
        }

        // the client id runs up to the next double quote
        if(preg_match('/client_id=([^"]+)/', $js, $match) === 1){
            apcu_store("sc_token", $match[1]);
            return $match[1];
        }
    }

    throw new Exception("Did not find a Soundcloud token in the Javascript blobs");
}
/**
 * Flatten a text onto one line and cut it to about 300 characters.
 *
 * Line breaks are replaced by spaces, the text is wrapped at word
 * boundaries, and only the first wrapped line is kept.
 *
 * @param string|null $text Text to shorten; null is treated as "".
 * @return string The shortened single-line text.
 */
private function limitstrlen($text){
    // descriptions can be missing; passing null to str_replace()/wordwrap()
    // is deprecated since PHP 8.1
    $flat =
        str_replace(
            ["\n\r", "\r\n", "\n", "\r"],
            " ",
            $text ?? ""
        );

    $wrapped = wordwrap($flat, 300, "\n");

    // only the first line is needed, so stop splitting after it
    return explode("\n", $wrapped, 2)[0];
}
}

668
scraper/solofield.php Normal file
View file

@ -0,0 +1,668 @@
<?php
class solofield{
/**
 * Load the shared helpers and create the instances used by this scraper:
 * a backend registered under the name "solofield" and an HTML parser.
 */
public function __construct(){
    // include_once instead of include: other scraper classes load these same
    // files, and including a class definition twice in one request is fatal
    include_once "lib/backend.php";
    $this->backend = new backend("solofield");

    include_once "lib/fuckhtml.php";
    $this->fuckhtml = new fuckhtml();
}
/**
 * Describe the search filters offered by this scraper.
 *
 * @param string $page Requested page; the same filters are returned for any value.
 * @return array Filter name => ["display" => label, "option" => [value => label]].
 */
public function getfilters($page){
    $nsfw = [
        "display" => "NSFW",
        "option" => [
            "yes" => "Yes",
            "no" => "No"
        ]
    ];

    return ["nsfw" => $nsfw];
}
/**
 * Send a GET request with browser-like headers and return the response body.
 *
 * @param mixed  $proxy Proxy descriptor passed on to backend::assign_proxy().
 * @param string $url   URL without a query string.
 * @param array  $get   Query parameters; appended to $url unless empty.
 * @return string|bool  The value returned by curl_exec() (the body, since
 *                      CURLOPT_RETURNTRANSFER is enabled).
 * @throws Exception    Carrying the cURL error text when the transfer fails.
 */
private function get($proxy, $url, $get = []){
    if($get !== []){
        $url .= "?" . http_build_query($get);
    }

    $headers = [
        "User-Agent: " . config::USER_AGENT,
        "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language: en-US,en;q=0.5",
        "Accept-Encoding: gzip",
        "Referer: https://solofield.net",
        "DNT: 1",
        "Connection: keep-alive",
        // fixed cookie sent with every request
        "Cookie: cross-site-cookie=name; lno=35842050",
        "Upgrade-Insecure-Requests: 1",
        "Sec-Fetch-Dest: document",
        "Sec-Fetch-Mode: navigate",
        "Sec-Fetch-Site: same-origin",
        "Sec-Fetch-User: ?1"
    ];

    $curlproc = curl_init();

    try{
        curl_setopt($curlproc, CURLOPT_URL, $url);
        curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
        curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);

        curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
        curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
        curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

        $this->backend->assign_proxy($curlproc, $proxy);

        $data = curl_exec($curlproc);

        if(curl_errno($curlproc)){
            throw new Exception(curl_error($curlproc));
        }

        return $data;
    }finally{
        // previously the handle was only closed on success; close it on the
        // error path as well
        curl_close($curlproc);
    }
}
/**
 * Scrape solofield web results.
 *
 * @param array $get Request parameters: "s" (search term), "nsfw" ("yes"/"no")
 *                   and "npt" (next-page token, falsy for the first page)
 * @return array     Result structure; "web" holds the hits, "npt" the token
 *                   for the following page (null when there is none)
 * @throws Exception When the page cannot be fetched or parsed
 */
public function web($get){
	if($get["npt"]){
		// continuation: the stored token yields the query string and the proxy to reuse
		[$query, $proxy] = $this->backend->get($get["npt"], "web");
		try{
			$html =
				$this->get(
					$proxy,
					"https://solofield.net/search?" . $query,
					[]
				);
		}catch(Exception $error){
			throw new Exception("Failed to fetch search page");
		}
	}else{
		$proxy = $this->backend->get_ip();
		try{
			$html =
				$this->get(
					$proxy,
					"https://solofield.net/search",
					[
						"q" => $get["s"],
						"ie" => "UTF-8",
						"oe" => "UTF-8",
						"hl" => "ja", // changing this doesn't do anything
						"lr" => "lang_ja", // same here
						//"ls" => "", // ??
						"f" => ($get["nsfw"] == "yes" ? "off" : "on")
					]
				);
		}catch(Exception $error){
			throw new Exception("Failed to fetch search page");
		}
	}
	$out = [
		"status" => "ok",
		"spelling" => [
			"type" => "no_correction",
			"using" => null,
			"correction" => null
		],
		"npt" => null,
		"answer" => [],
		"web" => [],
		"image" => [],
		"video" => [],
		"news" => [],
		"related" => []
	];
	// check for errors and load the result div
	if($this->error_and_load($html)){
		return $out;
	}
	$items =
		$this->fuckhtml
		->getElementsByClassName(
			"g0",
			"li"
		);
	foreach($items as $item){
		$this->fuckhtml->load($item);
		$title_tag =
			$this->fuckhtml
			->getElementsByClassName(
				"r",
				"h3"
			);
		if(count($title_tag) === 0){
			continue;
		}
		// the result link lives inside the title heading
		$this->fuckhtml->load($title_tag[0]);
		$anchors =
			$this->fuckhtml
			->getElementsByTagName(
				"a"
			);
		if(count($anchors) === 0){
			// a heading without a link cannot produce a usable result
			continue;
		}
		$link =
			$this->fuckhtml
			->getTextContent(
				$anchors[0]
				["attributes"]
				["href"]
			);
		// back to the whole item for thumbnail and description
		$this->fuckhtml->load($item);
		$thumb = [
			"ratio" => null,
			"url" => null
		];
		$webshot =
			$this->fuckhtml
			->getElementsByClassName(
				"webshot",
				"img"
			);
		if(count($webshot) !== 0){
			$uri =
				$this->fuckhtml
				->getTextContent(
					$webshot[0]
					["attributes"]
					["src"]
				);
			// "now_printing" is presumably the site's placeholder image; skip it
			if(stripos($uri, "now_printing") === false){
				$thumb = [
					"ratio" => "1:1",
					"url" => "https://solofield.net" . $uri
				];
			}
		}
		$snippet =
			$this->fuckhtml
			->getElementsByClassName(
				"s",
				"div"
			);
		$out["web"][] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					$title_tag[0]
				),
			"description" =>
				count($snippet) === 0 ?
				null :
				$this->fuckhtml
				->getTextContent(
					$snippet[0]
				),
			"url" => $link,
			"date" => null,
			"type" => "web",
			"thumb" => $thumb,
			"sublink" => [],
			"table" => []
		];
	}
	// get next page
	$this->get_npt($html, $proxy, $out, "web");
	return $out;
}
/**
 * Scrape solofield image results (single page, the site offers no pagination).
 *
 * @param array $get Request parameters: "s" (search term), "nsfw" ("yes"/"no")
 * @return array     Result structure with the hits under "image"
 * @throws Exception When the page cannot be fetched or parsed
 */
public function image($get){
	// no pagination
	try{
		$html =
			$this->get(
				$this->backend->get_ip(),
				"https://solofield.net/isearch",
				[
					"q" => $get["s"],
					"ie" => "UTF-8",
					"oe" => "UTF-8",
					"hl" => "ja", // changing this doesn't do anything
					//"lr" => "lang_ja", // same here
					"ls" => "", // ??
					"f" => ($get["nsfw"] == "yes" ? "off" : "on")
				]
			);
	}catch(Exception $error){
		// same message as web()/video() so raw curl errors are not exposed
		throw new Exception("Failed to fetch search page");
	}
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	// check for errors and load the result div
	if($this->error_and_load($html)){
		return $out;
	}
	$images =
		$this->fuckhtml
		->getElementsByTagName(
			"li"
		);
	foreach($images as $image){
		$this->fuckhtml->load($image);
		$img =
			$this->fuckhtml
			->getElementsByTagName(
				"img"
			);
		$anchors =
			$this->fuckhtml
			->getElementsByTagName(
				"a"
			);
		if(
			count($img) === 0 ||
			count($anchors) === 0
		){
			// ?? invalid
			continue;
		}
		$img = $img[0];
		// the item text is read as "<width>x<height>"
		$size =
			explode(
				"x",
				$this->fuckhtml
				->getTextContent(
					$image
				),
				2
			);
		$size = [
			(int)trim($size[0]), // width
			isset($size[1]) ? (int)trim($size[1]) : 0 // height, 0 when the text has no "x"
		];
		$out["image"][] = [
			"title" => null,
			"source" => [
				[
					"url" =>
						"https://solofield.net/" .
						$this->fuckhtml
						->getTextContent(
							$img["attributes"]["src"]
						),
					"width" => $size[0],
					"height" => $size[1]
				]
			],
			"url" =>
				$this->fuckhtml
				->getTextContent(
					$anchors[0]
					["attributes"]
					["href"]
				)
		];
	}
	return $out;
}
/**
 * Scrape solofield video results.
 *
 * @param array $get Request parameters: "s" (search term), "nsfw" ("yes"/"no")
 *                   and "npt" (next-page token, falsy for the first page)
 * @return array     Result structure; hits under "video", next-page token under "npt"
 * @throws Exception When the page cannot be fetched or parsed
 */
public function video($get){
	if($get["npt"]){
		// continuation: the stored token yields the query string and the proxy to reuse
		[$query, $proxy] = $this->backend->get($get["npt"], "videos");
		try{
			$html =
				$this->get(
					$proxy,
					"https://solofield.net/vsearch?" . $query,
					[]
				);
		}catch(Exception $error){
			throw new Exception("Failed to fetch search page");
		}
	}else{
		$proxy = $this->backend->get_ip();
		try{
			$html =
				$this->get(
					$proxy,
					"https://solofield.net/vsearch",
					[
						"q" => $get["s"],
						"ie" => "UTF-8",
						"oe" => "UTF-8",
						"hl" => "ja", // changing this doesn't do anything
						//"lr" => "lang_ja", // same here
						"ls" => "", // ??
						"f" => ($get["nsfw"] == "yes" ? "off" : "on")
					]
				);
		}catch(Exception $error){
			throw new Exception("Failed to fetch search page");
		}
	}
	$out = [
		"status" => "ok",
		"npt" => null,
		"video" => [],
		"author" => [],
		"livestream" => [],
		"playlist" => [],
		"reel" => []
	];
	// check for errors and load the result div
	if($this->error_and_load($html)){
		return $out;
	}
	$items =
		$this->fuckhtml
		->getElementsByTagName(
			"li"
		);
	foreach($items as $item){
		$this->fuckhtml->load($item);
		$as =
			$this->fuckhtml
			->getElementsByTagName(
				"a"
			);
		if(count($as) === 0){
			continue;
		}
		$thumb =
			$this->fuckhtml
			->getElementsByTagName(
				"img"
			);
		if(count($thumb) !== 0){
			$thumb = [
				"ratio" => "16:9",
				"url" =>
					"https://solofield.net/" .
					// run the attribute through getTextContent like image() does
					$this->fuckhtml
					->getTextContent(
						$thumb[0]
						["attributes"]
						["src"]
					)
			];
		}else{
			$thumb = [
				"ratio" => null,
				"url" => null
			];
		}
		$date =
			$this->fuckhtml
			->getElementsByAttributeValue(
				"style",
				"font-size: 10px;",
				"span"
			);
		if(count($date) !== 0){
			$date =
				$this->unfuckdate(
					$this->fuckhtml
					->getTextContent(
						$date[0]
					)
				);
		}else{
			$date = null;
		}
		$center_td =
			$this->fuckhtml
			->getElementsByAttributeValue(
				"align",
				"center",
				"td"
			);
		if(count($center_td) === 2){
			// extract the cell text first, then convert "h:m:s" to seconds
			// (the two calls used to be nested the other way round, handing
			// an element to hms2int instead of a string)
			$duration =
				$this->hms2int(
					$this->fuckhtml
					->getTextContent(
						$center_td[0]
					)
				);
		}else{
			$duration = null;
		}
		$out["video"][] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					// the title is taken from the second anchor; fall back to
					// the first when the item only has one
					$as[1] ?? $as[0]
				),
			"description" => null,
			"author" => [
				"name" => null,
				"url" => null,
				"avatar" => null
			],
			"date" => $date,
			"duration" => $duration,
			"views" => null,
			"thumb" => $thumb,
			"url" =>
				$this->fuckhtml
				->getTextContent(
					$as[0]
					["attributes"]
					["href"]
				)
		];
	}
	// get next page
	$this->get_npt($html, $proxy, $out, "videos");
	return $out;
}
private function get_npt($html, $proxy, &$out, $type){
	// Re-parse the whole page: the callers have loaded individual result
	// items into the parser in the meantime.
	$this->fuckhtml->load($html);
	// Without the "pjs" element there is nothing to look for.
	if(!$this->fuckhtml->getElementById("pjs")){
		return;
	}
	$candidates = $this->fuckhtml->getElementsByClassName("alnk", "span");
	foreach($candidates as $candidate){
		$label = $this->fuckhtml->getTextContent($candidate);
		if(stripos($label, "Next") === false){
			continue;
		}
		// Store the query string of the "Next" link together with the proxy;
		// $out["npt"] receives whatever backend::store() returns.
		$this->fuckhtml->load($candidate);
		$anchors = $this->fuckhtml->getElementsByTagName("a");
		$query = parse_url($anchors[0]["attributes"]["href"], PHP_URL_QUERY);
		$out["npt"] = $this->backend->store($query, $type, $proxy);
	}
}
private function error_and_load($html){
	// An empty body is treated as an IP block.
	if(strlen($html) === 0){
		throw new Exception("Solofield blocked the request IP");
	}
	$this->fuckhtml->load($html);
	$list = $this->fuckhtml->getElementById("list", "div");
	if($list !== false){
		// Results present: narrow the parser to the list and report "no error".
		$this->fuckhtml->load($list);
		return false;
	}
	// No list: a "nosearch" div means the query simply had no results (return true);
	// anything else is an unexpected page.
	$nosearch = $this->fuckhtml->getElementById("nosearch", "div");
	if(!$nosearch){
		throw new Exception("Failed to grep search list");
	}
	return true;
}
/**
 * Convert a "<label>:<date>" string into a unix timestamp.
 *
 * Everything after the first colon is kept, each run of non-digits is turned
 * into "-", and the result is handed to strtotime().
 *
 * @param string $date Text containing a colon followed by the date
 * @return int|null    Timestamp, or null when there is no colon or the
 *                     remainder cannot be parsed
 */
private function unfuckdate($date){
	$parts = explode(":", $date, 2);
	if(!isset($parts[1])){
		// no "label:" prefix, nothing to parse
		return null;
	}
	// strip separators on BOTH ends: a leading "-" (e.g. from a space after
	// the colon) would otherwise be read by strtotime() as a signed year
	$normalized =
		trim(
			preg_replace(
				'/[^0-9]+/',
				"-",
				$parts[1]
			),
			"-"
		);
	$timestamp = strtotime($normalized);
	// null is what the rest of this file uses for "no date"
	return $timestamp === false ? null : $timestamp;
}
private function hms2int($time){
	// Accepts "s", "m:s" or "h:m:s"; anything after a third colon stays glued
	// to the last segment and is dropped by the integer cast.
	$segments = explode(":", $time, 3);
	$seconds = 0;
	// Each further segment shifts the running total up by a factor of 60.
	foreach($segments as $segment){
		$seconds = ($seconds * 60) + (int)$segment;
	}
	return $seconds;
}
}

726
scraper/spotify.php Normal file
View file

@ -0,0 +1,726 @@
<?php
class spotify{
private const req_web = 0;
private const req_api = 1;
private const req_clientid = 2;
public function __construct(){
	// Pull in both helper libraries first, then build the two objects
	// every other method of this scraper relies on.
	include "lib/backend.php";
	include "lib/fuckhtml.php";
	// backend: proxy selection (get_ip/assign_proxy)
	$this->backend = new backend("spotify");
	// fuckhtml: HTML parser used to find the session script tag
	$this->fuckhtml = new fuckhtml();
}
public function getfilters($page){
	// Category choices offered to the user; $page is not consulted.
	$categories = [
		"any" => "All (no pagination)",
		"audiobooks" => "Audiobooks",
		"tracks" => "Songs",
		"artists" => "Artists",
		"playlists" => "Playlists",
		"albums" => "Albums",
		"podcastAndEpisodes" => "Podcasts & Shows (no pagination)",
		"episodes" => "Episodes",
		"users" => "Profiles"
	];
	return [
		"category" => [
			"display" => "Category",
			"option" => $categories
		]
	];
}
/**
 * Perform a request against one of the spotify endpoints.
 *
 * @param mixed       $proxy   Proxy descriptor handed to backend::assign_proxy()
 * @param string      $url     Target URL
 * @param array       $get     Query parameters (req_web/req_api) or the JSON
 *                             body to POST (req_clientid)
 * @param int         $reqtype One of self::req_web, self::req_api, self::req_clientid
 * @param string|null $bearer  Bearer token, sent for req_api
 * @param string|null $token   Client token, sent for req_api
 * @return string|bool         Response body as returned by curl_exec()
 * @throws Exception           On an unknown $reqtype or a failed transfer
 */
private function get($proxy, $url, $get = [], $reqtype = self::req_web, $bearer = null, $token = null){
	$curlproc = curl_init();
	try{
		switch($reqtype){
			case self::req_api:
				$headers = [
					"User-Agent: " . config::USER_AGENT,
					"Accept: application/json",
					"Accept-Language: en",
					"app-platform: WebPlayer",
					"authorization: Bearer {$bearer}",
					"client-token: {$token}",
					"content-type: application/json;charset=UTF-8",
					"Origin: https://open.spotify.com",
					"Referer: https://open.spotify.com/",
					"DNT: 1",
					"Connection: keep-alive",
					"Sec-Fetch-Dest: empty",
					"Sec-Fetch-Mode: cors",
					"Sec-Fetch-Site: same-site",
					"spotify-app-version: 1.2.27.93.g7aee53d4",
					"TE: trailers"
				];
				break;
			case self::req_web:
				$headers = [
					"User-Agent: " . config::USER_AGENT,
					"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
					"Accept-Language: en-US,en;q=0.5",
					"Accept-Encoding: gzip",
					"DNT: 1",
					"Sec-GPC: 1",
					"Connection: keep-alive",
					"Upgrade-Insecure-Requests: 1",
					"Sec-Fetch-Dest: document",
					"Sec-Fetch-Mode: navigate",
					"Sec-Fetch-Site: cross-site"
				];
				break;
			case self::req_clientid:
				// this endpoint wants the parameters as a JSON POST body
				$get = json_encode($get);
				curl_setopt($curlproc, CURLOPT_POST, true);
				curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);
				$headers = [
					"User-Agent:" . config::USER_AGENT,
					"Accept: application/json",
					"Accept-Language: en-US,en;q=0.5",
					"Accept-Encoding: gzip, deflate, br",
					"Referer: https://open.spotify.com/",
					"content-type: application/json",
					"Content-Length: " . strlen($get),
					"Origin: https://open.spotify.com",
					"DNT: 1",
					"Sec-GPC: 1",
					"Connection: keep-alive",
					"Sec-Fetch-Dest: empty",
					"Sec-Fetch-Mode: cors",
					"Sec-Fetch-Site: same-site",
					"TE: trailers"
				];
				break;
			default:
				// without this, $headers would be undefined further down
				throw new Exception("Unknown request type");
		}
		if(
			$reqtype !== self::req_clientid &&
			$get !== []
		){
			$url .= "?" . http_build_query($get);
		}
		curl_setopt($curlproc, CURLOPT_URL, $url);
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		$this->backend->assign_proxy($curlproc, $proxy);
		$data = curl_exec($curlproc);
		if(curl_errno($curlproc)){
			throw new Exception(curl_error($curlproc));
		}
		return $data;
	}finally{
		// release the handle on every path, including the throwing ones
		curl_close($curlproc);
	}
}
/**
 * Search spotify and normalise the results.
 *
 * Flow: fetch the web search page, read the access token and client id from
 * its "session" script tag, exchange the client id for a client token, then
 * call the pathfinder "searchDesktop" operation with both tokens.
 *
 * @param array $get Request parameters: "s" (search term) and "category"
 * @return array     Result structure with "song", "playlist", "album",
 *                   "podcast", "author" and "user" lists ("npt" is always null)
 * @throws Exception When any request or decoding step fails
 */
public function music($get){
	$search = $get["s"];
	$ip = $this->backend->get_ip();
	$category = $get["category"];
	/*
		Captured pathfinder requests, for reference. All are GETs to
		https://api-partner.spotify.com/pathfinder/v1/query with
		operationName=<op>&variables=<json>&extensions={"persistedQuery":{"version":1,"sha256Hash":"<hash>"}}
		Unless noted, variables are
		{"searchTerm":..,"offset":..,"limit":..,"numberOfTopResults":20,"includeAudiobooks":true}

		audiobooks  searchAudiobooks    8758e540afdba5afa3c5246817f6bd31d86a15b3f5666c363dd017030f35d785
		            offset/limit seen: 0/30, 30/30
		songs       searchTracks        16c02d6304f5f721fc2eb39dacf2361a4543815112506a9c05c9e0bc9733a679
		            offset/limit seen: 0/100, 100/100 (includeAudiobooks false)
		artists     searchArtists       b8840daafdda9a9ceadb7c5774731f63f9eca100445d2d94665f2dc58b45e2b9
		            offset/limit seen: 0/30, 30/23, 53/30
		playlists   searchPlaylists     19b4143a0500ccec189ca0f4a0316bc2c615ecb51ce993ba4d7d08afd1d87aa4
		            offset/limit seen: 0/30, 30/3
		albums      searchAlbums        e93b13cda461482da2940467eb2beed9368e9bb2fff37df3fb6633fc61271a27
		            offset/limit seen: 33/30
		podcasts & shows (contains authors, no pagination)
		            searchFullEpisodes  9f996251c9781fabce63f1a9980b5287ea33bc5e8c8953d0c4689b09936067a1
		            variables: {"searchTerm":..,"offset":0,"limit":30}
		episodes    searchDesktop       da03293d92a2cfc5e24597dcdc652c0ad135e1c64a78fddbf1478a7e096bea44
		            offset 0, limit 10, numberOfTopResults 5
		            ??? searchFullEpisodes (hash above) with offset 60, limit 30
		profiles    searchUsers         02026f48ab5001894e598904079b620ebc64f2d53b55ca20c3858abd3a46c5fb
		            offset/limit seen: 0/30, 30/30
	*/
	// get HTML
	try{
		$html =
			$this->get(
				$ip,
				"https://open.spotify.com/search/" .
				rawurlencode($search) .
				($category != "any" ? "/" . $category : ""),
				[]
			);
	}catch(Exception $error){
		throw new Exception("Failed to get initial search page");
	}
	// grep bearer and client ID
	$this->fuckhtml->load($html);
	$script =
		$this->fuckhtml
		->getElementById(
			"session",
			"script"
		);
	// other scrapers in this file compare a getElementById() miss against
	// false, so a strict null check would never fire; test falsiness instead
	if(!$script){
		throw new Exception("Failed to grep bearer token");
	}
	$script =
		json_decode(
			$script["innerHTML"],
			true
		);
	if(!isset($script["accessToken"], $script["clientId"])){
		throw new Exception("Failed to grep bearer token");
	}
	$bearer = $script["accessToken"];
	$client_id = $script["clientId"];
	// hit client ID endpoint
	try{
		$token =
			json_decode(
				$this->get(
					$ip,
					"https://clienttoken.spotify.com/v1/clienttoken",
					[ // !! this must be sent as json data
						"client_data" => [
							"client_id" => $client_id,
							"client_version" => "1.2.27.93.g7aee53d4",
							"js_sdk_data" => [
								"device_brand" => "unknown",
								"device_id" => "4c7ca20117ca12288ea8fc7118a9118c",
								"device_model" => "unknown",
								"device_name" => "computer",
								"os" => "windows",
								"os_version" => "NT 10.0"
							]
						]
					],
					self::req_clientid
				),
				true
			);
	}catch(Exception $error){
		throw new Exception("Failed to fetch token");
	}
	// covers both undecodable JSON and a response without a granted token
	if(!isset($token["granted_token"]["token"])){
		throw new Exception("Failed to decode token");
	}
	$token = $token["granted_token"]["token"];
	// NOTE: a switch on $get["option"] used to sit here; it read a key that
	// is not the filter name ("category") and only filled a variable nobody
	// used, so it was removed. The request below is what was always sent,
	// regardless of the chosen category.
	try{
		$payload =
			$this->get(
				$ip,
				"https://api-partner.spotify.com/pathfinder/v1/query",
				[
					"operationName" => "searchDesktop",
					"variables" =>
						json_encode(
							[
								"searchTerm" => $search,
								"offset" => 0,
								"limit" => 10,
								"numberOfTopResults" => 5,
								"includeAudiobooks" => true
							]
						),
					"extensions" =>
						json_encode(
							[
								"persistedQuery" => [
									"version" => 1,
									"sha256Hash" => "21969b655b795601fb2d2204a4243188e75fdc6d3520e7b9cd3f4db2aff9591e" // ?
								]
							]
						)
				],
				self::req_api,
				$bearer,
				$token
			);
	}catch(Exception $error){
		throw new Exception("Failed to fetch JSON results");
	}
	if($payload == "Token expired"){
		throw new Exception("Grepped spotify token has expired");
	}
	$payload = json_decode($payload, true);
	if($payload === null){
		throw new Exception("Failed to decode JSON results");
	}
	//$payload = json_decode(file_get_contents("scraper/spotify.json"), true);
	$out = [
		"status" => "ok",
		"npt" => null,
		"song" => [],
		"playlist" => [],
		"album" => [],
		"podcast" => [],
		"author" => [],
		"user" => []
	];
	// every section below tolerates its list being absent from the payload
	$results = $payload["data"]["searchV2"] ?? [];
	// get songs
	foreach(($results["tracksV2"]["items"] ?? []) as $result){
		// unwrap the optional "item"/"data" envelopes
		if(isset($result["item"])){
			$result = $result["item"];
		}
		if(isset($result["data"])){
			$result = $result["data"];
		}
		[$artist, $artist_link] = $this->get_artists($result["artists"]);
		$out["song"][] = [
			"title" => $result["name"],
			"description" => null,
			"url" => "https://open.spotify.com/track/" . $result["id"],
			"views" => null,
			"author" => [
				"name" => $artist,
				"url" => $artist_link,
				"avatar" => null
			],
			"thumb" => $this->get_thumb($result["albumOfTrack"]["coverArt"]),
			"date" => null,
			"duration" => $result["duration"]["totalMilliseconds"] / 1000,
			"stream" => [
				"endpoint" => "spotify",
				"url" => "track." . $result["id"]
			]
		];
	}
	// get playlists
	foreach(($results["playlists"]["items"] ?? []) as $playlist){
		if(isset($playlist["data"])){
			$playlist = $playlist["data"];
		}
		$avatar = $this->get_thumb($playlist["ownerV2"]["data"]["avatar"]);
		$out["playlist"][] = [
			"title" => $playlist["name"],
			"description" => null,
			"author" => [
				"name" => $playlist["ownerV2"]["data"]["name"],
				"url" =>
					"https://open.spotify.com/user/" .
					explode(
						":",
						$playlist["ownerV2"]["data"]["uri"],
						3
					)[2],
				"avatar" => $avatar["url"]
			],
			"thumb" => $this->get_thumb($playlist["images"]["items"][0]),
			"date" => null,
			"duration" => null,
			"url" =>
				"https://open.spotify.com/playlist/" .
				explode(
					":",
					$playlist["uri"],
					3
				)[2]
		];
	}
	// get albums
	foreach(($results["albums"]["items"] ?? []) as $album){
		if(isset($album["data"])){
			$album = $album["data"];
		}
		[$artist, $artist_link] = $this->get_artists($album["artists"]);
		$out["album"][] = [
			"title" => $album["name"],
			"description" => null,
			"author" => [
				"name" => $artist,
				"url" => $artist_link,
				"avatar" => null
			],
			"thumb" => $this->get_thumb($album["coverArt"]),
			// only the year is known: use January 1st of that year
			// (same instant the former month 0 / day 32 call produced)
			"date" => mktime(0, 0, 0, 1, 1, $album["date"]["year"]),
			"duration" => null,
			"url" =>
				"https://open.spotify.com/album/" .
				explode(
					":",
					$album["uri"],
					3
				)[2]
		];
	}
	// get podcasts
	foreach(($results["podcasts"]["items"] ?? []) as $podcast){
		if(isset($podcast["data"])){
			$podcast = $podcast["data"];
		}
		// the topic titles double as the description
		$description = [];
		foreach(($podcast["topics"]["items"] ?? []) as $subject){
			$description[] = $subject["title"];
		}
		$description = implode(", ", $description);
		if($description == ""){
			$description = null;
		}
		$out["podcast"][] = [
			"title" => $podcast["name"],
			"description" => $description,
			"author" => [
				"name" => $podcast["publisher"]["name"],
				"url" => null,
				"avatar" => null
			],
			"thumb" => $this->get_thumb($podcast["coverArt"]),
			"date" => null,
			"duration" => null,
			"url" =>
				"https://open.spotify.com/show/" .
				explode(
					":",
					$podcast["uri"],
					3
				)[2],
			"stream" => [
				"endpoint" => null,
				"url" => null
			]
		];
	}
	// get audio books (put in podcasts)
	foreach(($results["audiobooks"]["items"] ?? []) as $podcast){
		if(isset($podcast["data"])){
			$podcast = $podcast["data"];
		}
		$description = [];
		foreach(($podcast["topics"]["items"] ?? []) as $subject){
			$description[] = $subject["title"];
		}
		$description = implode(", ", $description);
		if($description == ""){
			$description = null;
		}
		$authors = [];
		foreach(($podcast["authors"] ?? []) as $author){
			$authors[] = $author["name"];
		}
		$authors = implode(", ", $authors);
		if($authors == ""){
			$authors = null;
		}
		$uri =
			explode(
				":",
				$podcast["uri"],
				3
			)[2];
		$out["podcast"][] = [
			"title" => $podcast["name"],
			"description" => $description,
			"author" => [
				"name" => $authors,
				"url" => null,
				"avatar" => null
			],
			"thumb" => $this->get_thumb($podcast["coverArt"]),
			"date" => strtotime($podcast["publishDate"]["isoString"]),
			"duration" => null,
			"url" => "https://open.spotify.com/show/" . $uri,
			"stream" => [
				"endpoint" => "spotify",
				"url" => "episode." . $uri
			]
		];
	}
	// get episodes (and place them in podcasts)
	foreach(($results["episodes"]["items"] ?? []) as $podcast){
		if(isset($podcast["data"])){
			$podcast = $podcast["data"];
		}
		$out["podcast"][] = [
			"title" => $podcast["name"],
			"description" => $this->limitstrlen($podcast["description"]),
			"author" => [
				"name" => $podcast["podcastV2"]["data"]["publisher"]["name"] ?? null,
				"url" => null,
				"avatar" => null
			],
			"thumb" => $this->get_thumb($podcast["coverArt"]),
			"date" => strtotime($podcast["releaseDate"]["isoString"]),
			"duration" => $podcast["duration"]["totalMilliseconds"] / 1000,
			"url" =>
				"https://open.spotify.com/show/" .
				explode(
					":",
					$podcast["uri"],
					3
				)[2],
			"stream" => [
				"endpoint" => null,
				"url" => null
			]
		];
	}
	// get authors
	foreach(($results["artists"]["items"] ?? []) as $user){
		if(isset($user["data"])){
			$user = $user["data"];
		}
		$avatar = $this->get_thumb($user["visuals"]["avatarImage"]);
		$out["author"][] = [
			"title" =>
				(
					// NOTE(review): both branches are empty strings, so the
					// verified flag currently has no effect; a marker
					// character may have been lost here. Confirm upstream.
					$user["profile"]["verified"] === true ?
					"" : ""
				) .
				$user["profile"]["name"],
			"followers" => null,
			"description" => null,
			"thumb" => $avatar,
			"url" =>
				"https://open.spotify.com/artist/" .
				explode(
					":",
					$user["uri"],
					3
				)[2]
		];
	}
	// get users
	foreach(($results["users"]["items"] ?? []) as $user){
		if(isset($user["data"])){
			$user = $user["data"];
		}
		$avatar = $this->get_thumb($user["avatar"]);
		$out["user"][] = [
			"title" => $user["displayName"] . " (@{$user["id"]})",
			"followers" => null,
			"description" => null,
			"thumb" => $avatar,
			"url" => "https://open.spotify.com/user/" . $user["id"]
		];
	}
	return $out;
}
/**
 * Build a display name and a profile link from an artist list.
 *
 * @param array|null $artists Structure with an "items" list; each item holds
 *                            ["profile"]["name"] and a "uri" of the form a:b:<id>
 * @return array              [names joined by ", ", link to the FIRST artist];
 *                            [null, null] when there are no names, and a null
 *                            link when the first artist has no usable uri
 */
private function get_artists($artists){
	$names = [];
	foreach(($artists["items"] ?? []) as $artist){
		$names[] = $artist["profile"]["name"];
	}
	$names = implode(", ", $names);
	if($names == ""){
		return [null, null];
	}
	// The old guard tested the leftover loop variable, which is never null
	// at this point; check the uri that is actually used instead.
	$artist_link = null;
	if(isset($artists["items"][0]["uri"])){
		$uri_parts = explode(":", $artists["items"][0]["uri"]);
		if(isset($uri_parts[2])){
			$artist_link = "https://open.spotify.com/artist/" . $uri_parts[2];
		}
	}
	return [$names, $artist_link];
}
private function get_thumb($cover){
	// Walk the available sources and remember the widest one; the first
	// source always becomes the initial pick.
	$widest = null;
	$sources = $cover === null ? [] : $cover["sources"];
	foreach($sources as $candidate){
		if(
			$widest !== null &&
			!((int)$candidate["width"] > $widest["width"])
		){
			continue;
		}
		$widest = $candidate;
	}
	// No cover, or a cover without sources: empty thumbnail.
	$found = $widest !== null;
	return [
		"url" => $found ? $widest["url"] : null,
		"ratio" => $found ? "1:1" : null
	];
}
private function limitstrlen($text){
	// Flatten every line-break variant into a single space so the text is one line.
	$flattened = str_replace(["\n\r", "\r\n", "\n", "\r"], " ", $text);
	// Re-wrap on word boundaries at 300 columns; rows are separated by "\n".
	$wrapped = wordwrap($flattened, 300, "\n");
	// Keep only the first wrapped row.
	$rows = explode("\n", $wrapped, 2);
	return $rows[0];
}
}

1579
scraper/startpage.php Normal file

File diff suppressed because it is too large Load diff

257
scraper/vsco.php Normal file
View file

@ -0,0 +1,257 @@
<?php
class vsco{
public function __construct(){
	// lib/backend.php supplies the backend class instantiated below;
	// it handles proxy selection and next-page token storage for this scraper.
	include "lib/backend.php";
	$backend = new backend("vsco");
	$this->backend = $backend;
}
public function getfilters($page){
	// This scraper exposes no filters, whatever the page type.
	$no_filters = [];
	return $no_filters;
}
/**
 * Perform a GET request against vsco through the given proxy.
 *
 * Without a bearer token the request is sent with browser-navigation
 * headers; with one it is sent with API headers, and the Referer is derived
 * from $get["query"].
 *
 * @param mixed       $proxy  Proxy descriptor handed to backend::assign_proxy()
 * @param string      $url    Target URL without query string
 * @param array       $get    Query parameters; appended to $url when non-empty
 * @param string|null $bearer Bearer token for API calls, null for page loads
 * @return string|bool        Response body as returned by curl_exec()
 * @throws Exception          With the curl error message when the transfer fails
 */
private function get($proxy, $url, $get = [], $bearer = null){
	if($get !== []){
		$url .= "?" . http_build_query($get);
	}
	if($bearer === null){
		$headers =
			["User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Sec-GPC: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: same-origin",
			"Sec-Fetch-User: ?1",
			"Priority: u=0, i",
			"TE: trailers"];
	}else{
		$headers =
			["User-Agent: " . config::USER_AGENT,
			"Accept: */*",
			"Accept-Language: en-US",
			"Accept-Encoding: gzip",
			// tolerate a missing "query" key instead of passing null to urlencode()
			"Referer: https://vsco.co/search/images/" . urlencode($get["query"] ?? ""),
			"authorization: Bearer " . $bearer,
			"content-type: application/json",
			"x-client-build: 1",
			"x-client-platform: web",
			"DNT: 1",
			"Sec-GPC: 1",
			"Connection: keep-alive",
			"Sec-Fetch-Dest: empty",
			"Sec-Fetch-Mode: cors",
			"Sec-Fetch-Site: same-origin",
			"Priority: u=0",
			"TE: trailers"];
	}
	$curlproc = curl_init();
	try{
		curl_setopt($curlproc, CURLOPT_URL, $url);
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		// http2 bypass
		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
		$this->backend->assign_proxy($curlproc, $proxy);
		$data = curl_exec($curlproc);
		if(curl_errno($curlproc)){
			throw new Exception(curl_error($curlproc));
		}
		return $data;
	}finally{
		// release the handle on every path, including the throwing one
		curl_close($curlproc);
	}
}
/**
 * Search vsco images.
 *
 * First page: load the feed page, pull the bearer token out of it and query
 * the search API. Further pages: restore pagination state and bearer token
 * from the stored next-page token.
 *
 * @param array $get Request parameters: "s" (search term) and "npt"
 *                   (next-page token, falsy for the first page)
 * @return array     Result structure; hits under "image", next-page token under "npt"
 * @throws Exception On an empty search term or when fetching/decoding fails
 */
public function image($get){
	if($get["npt"]){
		[$data, $proxy] =
			$this->backend->get(
				$get["npt"], "images"
			);
		$data = json_decode($data, true);
		if(!isset($data["pagination"]["page"], $data["pagination"]["size"], $data["bearer"])){
			// the stored payload is not what this method wrote
			throw new Exception("Failed to decode next page token");
		}
	}else{
		$search = $get["s"];
		if(strlen($search) === 0){
			throw new Exception("Search term is empty!");
		}
		$proxy = $this->backend->get_ip();
		// get bearer token
		try{
			$html =
				$this->get(
					$proxy,
					"https://vsco.co/feed"
				);
		}catch(Exception $error){
			throw new Exception("Failed to fetch feed page");
		}
		// NOTE(review): the A-z range also matches the ASCII punctuation
		// between "Z" and "a"; left untouched in case tokens rely on it
		preg_match(
			'/"tkn":"([A-z0-9]+)"/',
			$html,
			$bearer
		);
		if(!isset($bearer[1])){
			throw new Exception("Failed to grep bearer token");
		}
		$data = [
			"pagination" => [
				"query" => $search,
				"page" => 0,
				"size" => 100
			],
			"bearer" => $bearer[1]
		];
	}
	try{
		$json =
			$this->get(
				$proxy,
				"https://vsco.co/api/2.0/search/images",
				$data["pagination"],
				$data["bearer"]
			);
	}catch(Exception $error){
		throw new Exception("Failed to fetch JSON");
	}
	$json = json_decode($json, true);
	if($json === null){
		throw new Exception("Failed to decode JSON");
	}
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	if(!isset($json["results"])){
		throw new Exception("Failed to access results object");
	}
	foreach($json["results"] as $image){
		// responsive_url carries no scheme: split it into host and path
		$image_domain = parse_url("https://" . $image["responsive_url"], PHP_URL_HOST);
		$thumbnail = explode($image_domain, $image["responsive_url"], 2)[1];
		if(substr($thumbnail, 0, 3) != "/1/"){
			// drop the first path segment when the path does not start with "/1/"
			$thumbnail =
				preg_replace(
					'/^\/[^\/]+/',
					"",
					$thumbnail
				);
		}
		$thumbnail = "https://img.vsco.co/cdn-cgi/image/width=480,height=360" . $thumbnail;
		$size =
			$this->image_ratio(
				(int)$image["dimensions"]["width"],
				(int)$image["dimensions"]["height"]
			);
		$out["image"][] = [
			"title" => $image["description"],
			"source" => [
				[
					"url" => "https://" . $image["responsive_url"],
					"width" => (int)$image["dimensions"]["width"],
					"height" => (int)$image["dimensions"]["height"]
				],
				[
					"url" => $thumbnail,
					"width" => $size[0],
					"height" => $size[1]
				]
			],
			"url" => "https://" . $image["grid"]["domain"] . "/media/" . $image["imageId"]
		];
	}
	// get NPT
	// page count follows the page size that was actually requested instead
	// of a second hard-coded 100; a missing total means "no more pages"
	$max_page = ceil(($json["total"] ?? 0) / $data["pagination"]["size"]);
	$data["pagination"]["page"]++;
	if($max_page > $data["pagination"]["page"]){
		$out["npt"] =
			$this->backend->store(
				json_encode($data),
				"images",
				$proxy
			);
	}
	return $out;
}
/**
 * Scale an image size so it fits a 480x360 box while keeping its aspect ratio.
 *
 * @param int $width  Original width in pixels
 * @param int $height Original height in pixels
 * @return array      [width, height] as integers; the box itself
 *                    ([480, 360]) when either dimension is not positive
 */
private function image_ratio($width, $height){
	if($width <= 0 || $height <= 0){
		// unknown or bogus dimensions would divide by zero below;
		// fall back to the box the thumbnail is requested at
		return [480, 360];
	}
	// the smaller factor is the one that makes both sides fit
	$ratio = min(480 / $width, 360 / $height);
	// integers, like the width/height of the full-size source
	return [
		(int)floor($width * $ratio),
		(int)floor($height * $ratio)
	];
}
}

246
scraper/wiby.php Normal file
View file

@ -0,0 +1,246 @@
<?php
class wiby{
public function __construct(){
	// lib/backend.php supplies the backend class instantiated below;
	// it handles proxy selection and next-page token storage for this scraper.
	include "lib/backend.php";
	$backend = new backend("wiby");
	$this->backend = $backend;
}
public function getfilters($page){
	// Filters exist for the web page only; every other page gets none.
	$filters = [];
	if($page == "web"){
		$filters = [
			"nsfw" => [
				"display" => "NSFW",
				"option" => [
					"yes" => "Yes",
					"no" => "No"
				]
			],
			"date" => [
				"display" => "Time posted",
				"option" => [
					"any" => "Any time",
					"day" => "Past day",
					"week" => "Past week",
					"month" => "Past month",
					"year" => "Past year"
				]
			]
		];
	}
	return $filters;
}
/**
 * Perform a GET request against wiby through the given proxy.
 *
 * $get no longer declares a default: an optional parameter in front of the
 * required $nsfw could never be omitted anyway, and PHP 8 reports that
 * signature as deprecated. Callers already pass all four arguments.
 *
 * @param mixed  $proxy Proxy descriptor handed to backend::assign_proxy()
 * @param string $url   Target URL without query string
 * @param array  $get   Query parameters; appended to $url when non-empty
 * @param string $nsfw  Value sent in the "ws" cookie
 * @return string|bool  Response body as returned by curl_exec()
 * @throws Exception    With the curl error message when the transfer fails
 */
private function get($proxy, $url, $get, $nsfw){
	if($get !== []){
		$url .= "?" . http_build_query($get);
	}
	$curlproc = curl_init();
	try{
		curl_setopt($curlproc, CURLOPT_URL, $url);
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"Cookie: ws={$nsfw}",
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"]
		);
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		$this->backend->assign_proxy($curlproc, $proxy);
		$data = curl_exec($curlproc);
		if(curl_errno($curlproc)){
			throw new Exception(curl_error($curlproc));
		}
		return $data;
	}finally{
		// release the handle on every path, including the throwing one
		curl_close($curlproc);
	}
}
/**
 * Scrape wiby web results.
 *
 * @param array $get Request parameters: "s" (search term), "date"
 *                   ("any"/"day"/"week"/"month"/"year"), "nsfw" ("yes"/"no")
 *                   and "npt" (next-page token, falsy for the first page)
 * @return array     Result structure; hits under "web", next-page token under "npt"
 * @throws Exception On an empty search term or when the page cannot be fetched
 */
public function web($get){
	if($get["npt"]){
		// continuation: the token holds the query parameters plus the nsfw flag
		[$q, $proxy] = $this->backend->get($get["npt"], "web");
		$q = json_decode($q, true);
		$nsfw = $q["nsfw"];
		// nsfw travels in the cookie, not in the query string
		unset($q["nsfw"]);
	}else{
		$search = $get["s"];
		if(strlen($search) === 0){
			throw new Exception("Search term is empty!");
		}
		$proxy = $this->backend->get_ip();
		$date = $get["date"];
		$nsfw = $get["nsfw"] == "yes" ? "0" : "1";
		// Strip bang-style tokens from the user's query. str_replace() works
		// through the list in order, so the three-character tokens must come
		// before their two-character prefixes; otherwise "!gi" would lose
		// only "!g" and leave a stray "i" in the search term.
		$search =
			str_replace(
				[
					"!gi",
					"!gv",
					"!gm",
					"!g",
					"!bi",
					"!bv",
					"!bm",
					"!b",
					"!td",
					"!tw",
					"!tm",
					"!ty",
					"&gi",
					"&gv",
					"&gm",
					"&g",
					"&bi",
					"&bv",
					"&bm",
					"&b",
					"&td",
					"&tw",
					"&tm",
					"&ty",
				],
				"",
				$search
			);
		// the date filter is applied by prefixing the matching time token
		switch($date){
			case "day": $search = "!td " . $search; break;
			case "week": $search = "!tw " . $search; break;
			case "month": $search = "!tm " . $search; break;
			case "year": $search = "!ty " . $search; break;
		}
		$q = [
			"q" => $search
		];
	}
	try{
		$html = $this->get(
			$proxy,
			"https://wiby.me/",
			$q,
			$nsfw
		);
	}catch(Exception $error){
		throw new Exception("Failed to fetch search page");
	}
	// the "Find more..." link carries the next page number
	preg_match(
		'/<p class="pin"><blockquote>(?:<\/p>)?<br><a class="more" href="\/\?q=[^"]+&p=([0-9]+)">Find more\.\.\.<\/a><\/blockquote>/',
		$html,
		$nextpage
	);
	if(count($nextpage) === 0){
		$nextpage = null;
	}else{
		$nextpage =
			$this->backend->store(
				json_encode([
					"q" => $q["q"],
					"p" => (int)$nextpage[1],
					"nsfw" => $nsfw
				]),
				"web",
				$proxy
			);
	}
	$out = [
		"status" => "ok",
		"spelling" => [
			"type" => "no_correction",
			"using" => null,
			"correction" => null
		],
		"npt" => $nextpage,
		"answer" => [],
		"web" => [],
		"image" => [],
		"video" => [],
		"news" => [],
		"related" => []
	];
	// capture groups: 1 = href, 2 = link text (title), 3 = paragraph (description)
	preg_match_all(
		'/<blockquote>[\s]*<a .* href="(.*)">(.*)<\/a>.*<p>(.*)<\/p>[\s]*<\/blockquote>/Ui',
		$html,
		$links
	);
	for($i=0; $i<count($links[0]); $i++){
		$out["web"][] = [
			"title" => $this->unescapehtml(trim($links[2][$i])),
			"description" => $this->unescapehtml(trim(strip_tags($links[3][$i]), ".\n\r ")),
			"url" => trim($links[1][$i]),
			"date" => null,
			"type" => "web",
			"thumb" => [
				"url" => null,
				"ratio" => null
			],
			"sublink" => [],
			"table" => []
		];
	}
	return $out;
}
/**
 * Turn an HTML fragment into plain text: line-break tags become newlines
 * and entities are decoded.
 *
 * @param string $str HTML fragment.
 * @return string Decoded text.
 */
private function unescapehtml($str){
    // Line-break spellings that are recognized (lower and upper case only).
    $breaks = ["<br>", "<br/>", "</br>", "<BR>", "<BR/>", "</BR>"];

    $with_newlines = str_replace($breaks, "\n", $str);

    // ENT_QUOTES also decodes single quotes; ENT_XML1 selects the XML 1
    // entity table.
    $flags = ENT_QUOTES | ENT_XML1;

    return html_entity_decode($with_newlines, $flags, 'UTF-8');
}
}

1170
scraper/yandex.php Normal file

File diff suppressed because it is too large Load diff

741
scraper/yep.php Normal file
View file

@ -0,0 +1,741 @@
<?php
/**
 * Scraper for the yep.com search API (https://api.yep.com/fs/2/search).
 *
 * Exposes web(), image() and news() searches that all return arrays in the
 * common result layout used by the other scrapers in this project.
 */
class yep{

    // Declared explicitly: creating these dynamically in the constructor is
    // deprecated as of PHP 8.2. Kept public so external access is unchanged.
    public $backend;
    public $fuckhtml;

    /**
     * Load the project helpers and create the backend ("yep" namespace) and
     * HTML parser instances used by this scraper.
     */
    public function __construct(){

        include "lib/backend.php";
        $this->backend = new backend("yep");

        include "lib/fuckhtml.php";
        $this->fuckhtml = new fuckhtml();
    }

    /**
     * Describe the filters this scraper supports.
     *
     * @param string $page Requested page type (unused: the same filters are
     *                     returned for every page).
     * @return array Filter definitions keyed by filter name, each with a
     *               "display" label and an "option" map of value => label.
     */
    public function getfilters($page){

        return [
            "country" => [
                "display" => "Country",
                "option" => [
                    "all" => "All regions",
                    "af" => "Afghanistan",
                    "al" => "Albania",
                    "dz" => "Algeria",
                    "as" => "American Samoa",
                    "ad" => "Andorra",
                    "ao" => "Angola",
                    "ai" => "Anguilla",
                    "ag" => "Antigua and Barbuda",
                    "ar" => "Argentina",
                    "am" => "Armenia",
                    "aw" => "Aruba",
                    "au" => "Australia",
                    "at" => "Austria",
                    "az" => "Azerbaijan",
                    "bs" => "Bahamas",
                    "bh" => "Bahrain",
                    "bd" => "Bangladesh",
                    "bb" => "Barbados",
                    "by" => "Belarus",
                    "be" => "Belgium",
                    "bz" => "Belize",
                    "bj" => "Benin",
                    "bt" => "Bhutan",
                    "bo" => "Bolivia",
                    "ba" => "Bosnia and Herzegovina",
                    "bw" => "Botswana",
                    "br" => "Brazil",
                    "bn" => "Brunei Darussalam",
                    "bg" => "Bulgaria",
                    "bf" => "Burkina Faso",
                    "bi" => "Burundi",
                    "cv" => "Cabo Verde",
                    "kh" => "Cambodia",
                    "cm" => "Cameroon",
                    "ca" => "Canada",
                    "ky" => "Cayman Islands",
                    "cf" => "Central African Republic",
                    "td" => "Chad",
                    "cl" => "Chile",
                    "cn" => "China",
                    "co" => "Colombia",
                    "cg" => "Congo",
                    "cd" => "Congo, Democratic Republic",
                    "ck" => "Cook Islands",
                    "cr" => "Costa Rica",
                    "hr" => "Croatia",
                    "cu" => "Cuba",
                    "cy" => "Cyprus",
                    "cz" => "Czechia",
                    "ci" => "Côte d'Ivoire",
                    "dk" => "Denmark",
                    "dj" => "Djibouti",
                    "dm" => "Dominica",
                    "do" => "Dominican Republic",
                    "ec" => "Ecuador",
                    "eg" => "Egypt",
                    "sv" => "El Salvador",
                    "gq" => "Equatorial Guinea",
                    "ee" => "Estonia",
                    "et" => "Ethiopia",
                    "fo" => "Faroe Islands",
                    "fj" => "Fiji",
                    "fi" => "Finland",
                    "fr" => "France",
                    "gf" => "French Guiana",
                    "pf" => "French Polynesia",
                    "ga" => "Gabon",
                    "gm" => "Gambia",
                    "ge" => "Georgia",
                    "de" => "Germany",
                    "gh" => "Ghana",
                    "gi" => "Gibraltar",
                    "gr" => "Greece",
                    "gl" => "Greenland",
                    "gd" => "Grenada",
                    "gp" => "Guadeloupe",
                    "gu" => "Guam",
                    "gt" => "Guatemala",
                    "gg" => "Guernsey",
                    "gn" => "Guinea",
                    "gy" => "Guyana",
                    "ht" => "Haiti",
                    "hn" => "Honduras",
                    "hk" => "Hong Kong",
                    "hu" => "Hungary",
                    "is" => "Iceland",
                    "in" => "India",
                    "id" => "Indonesia",
                    "iq" => "Iraq",
                    "ie" => "Ireland",
                    "im" => "Isle of Man",
                    "il" => "Israel",
                    "it" => "Italy",
                    "jm" => "Jamaica",
                    "jp" => "Japan",
                    "je" => "Jersey",
                    "jo" => "Jordan",
                    "kz" => "Kazakhstan",
                    "ke" => "Kenya",
                    "ki" => "Kiribati",
                    "kw" => "Kuwait",
                    "kg" => "Kyrgyzstan",
                    "la" => "Lao People's Democratic Republic",
                    "lv" => "Latvia",
                    "lb" => "Lebanon",
                    "ls" => "Lesotho",
                    "ly" => "Libya",
                    "li" => "Liechtenstein",
                    "lt" => "Lithuania",
                    "lu" => "Luxembourg",
                    "mk" => "Macedonia",
                    "mg" => "Madagascar",
                    "mw" => "Malawi",
                    "my" => "Malaysia",
                    "mv" => "Maldives",
                    "ml" => "Mali",
                    "mt" => "Malta",
                    "mq" => "Martinique",
                    "mr" => "Mauritania",
                    "mu" => "Mauritius",
                    "yt" => "Mayotte",
                    "mx" => "Mexico",
                    "fm" => "Micronesia, Federated States of",
                    "md" => "Moldova",
                    "mc" => "Monaco",
                    "mn" => "Mongolia",
                    "me" => "Montenegro",
                    "ms" => "Montserrat",
                    "ma" => "Morocco",
                    "mz" => "Mozambique",
                    "mm" => "Myanmar",
                    "na" => "Namibia",
                    "nr" => "Nauru",
                    "np" => "Nepal",
                    "nl" => "Netherlands",
                    "nc" => "New Caledonia",
                    "nz" => "New Zealand",
                    "ni" => "Nicaragua",
                    "ne" => "Niger",
                    "ng" => "Nigeria",
                    "nu" => "Niue",
                    "no" => "Norway",
                    "om" => "Oman",
                    "pk" => "Pakistan",
                    "ps" => "Palestine, State of",
                    "pa" => "Panama",
                    "pg" => "Papua New Guinea",
                    "py" => "Paraguay",
                    "pe" => "Peru",
                    "ph" => "Philippines",
                    "pn" => "Pitcairn",
                    "pl" => "Poland",
                    "pt" => "Portugal",
                    "pr" => "Puerto Rico",
                    "qa" => "Qatar",
                    "ro" => "Romania",
                    "ru" => "Russian Federation",
                    "rw" => "Rwanda",
                    "re" => "Réunion",
                    "sh" => "Saint Helena",
                    "kn" => "Saint Kitts and Nevis",
                    "lc" => "Saint Lucia",
                    "vc" => "Saint Vincent and the Grenadines",
                    "ws" => "Samoa",
                    "sm" => "San Marino",
                    "st" => "Sao Tome and Principe",
                    "sa" => "Saudi Arabia",
                    "sn" => "Senegal",
                    "rs" => "Serbia",
                    "sc" => "Seychelles",
                    "sl" => "Sierra Leone",
                    "sg" => "Singapore",
                    "sk" => "Slovakia",
                    "si" => "Slovenia",
                    "sb" => "Solomon Islands",
                    "so" => "Somalia",
                    "kr" => "South Korea",
                    "za" => "South Africa",
                    "es" => "Spain",
                    "lk" => "Sri Lanka",
                    "sr" => "Suriname",
                    "se" => "Sweden",
                    "ch" => "Switzerland",
                    "tw" => "Taiwan",
                    "tj" => "Tajikistan",
                    "tz" => "Tanzania",
                    "th" => "Thailand",
                    "tl" => "Timor-Leste",
                    "tg" => "Togo",
                    "tk" => "Tokelau",
                    "to" => "Tonga",
                    "tt" => "Trinidad and Tobago",
                    "tn" => "Tunisia",
                    "tr" => "Turkey",
                    "tm" => "Turkmenistan",
                    "ug" => "Uganda",
                    "ua" => "Ukraine",
                    "ae" => "United Arab Emirates",
                    "gb" => "United Kingdom",
                    "us" => "United States",
                    "uy" => "Uruguay",
                    "uz" => "Uzbekistan",
                    "vu" => "Vanuatu",
                    "ve" => "Venezuela",
                    "vn" => "Vietnam",
                    "vg" => "Virgin Islands, British",
                    "vi" => "Virgin Islands, U.S.",
                    "ye" => "Yemen",
                    "zm" => "Zambia",
                    "zw" => "Zimbabwe"
                ]
            ],
            "nsfw" => [
                "display" => "NSFW",
                "option" => [
                    "yes" => "Yes",
                    "maybe" => "Maybe",
                    "no" => "No"
                ]
            ]
        ];
    }

    /**
     * Perform an HTTP GET through the given proxy and return the raw body.
     *
     * @param mixed  $proxy Proxy descriptor handed to backend::assign_proxy().
     * @param string $url   Endpoint URL without a query string.
     * @param array  $get   Query parameters; appended when not empty.
     * @return string Response body.
     * @throws Exception With the curl error message when the transfer fails.
     */
    private function get($proxy, $url, $get = []){

        $curlproc = curl_init();

        if($get !== []){
            $get = http_build_query($get);
            $url .= "?" . $get;
        }

        curl_setopt($curlproc, CURLOPT_URL, $url);

        // use http2
        curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);

        // set ciphers
        // NOTE(review): these cipher names are not in OpenSSL format; they
        // presumably target the TLS backend of the curl-impersonate build
        // named in the Cloudflare error message -- confirm.
        curl_setopt(
            $curlproc,
            CURLOPT_SSL_CIPHER_LIST,
            "aes_128_gcm_sha_256,chacha20_poly1305_sha_256,aes_256_gcm_sha_384,ecdhe_ecdsa_aes_128_gcm_sha_256,ecdhe_rsa_aes_128_gcm_sha_256,ecdhe_ecdsa_chacha20_poly1305_sha_256,ecdhe_rsa_chacha20_poly1305_sha_256,ecdhe_ecdsa_aes_256_gcm_sha_384,ecdhe_rsa_aes_256_gcm_sha_384,ecdhe_ecdsa_aes_256_sha,ecdhe_ecdsa_aes_128_sha,ecdhe_rsa_aes_128_sha,ecdhe_rsa_aes_256_sha,rsa_aes_128_gcm_sha_256,rsa_aes_256_gcm_sha_384,rsa_aes_128_sha,rsa_aes_256_sha"
        );

        curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
        curl_setopt($curlproc, CURLOPT_HTTPHEADER,
            ["User-Agent: " . config::USER_AGENT,
            "Accept: */*",
            "Accept-Language: en-US,en;q=0.5",
            "Accept-Encoding: gzip, deflate, br, zstd",
            "Referer: https://yep.com/",
            "Origin: https://yep.com",
            "DNT: 1",
            "Connection: keep-alive",
            "Sec-Fetch-Dest: empty",
            "Sec-Fetch-Mode: cors",
            "Sec-Fetch-Site: same-site",
            "Priority: u=4",
            "TE: trailers"]
        );

        curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
        curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
        curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);

        $this->backend->assign_proxy($curlproc, $proxy);

        $data = curl_exec($curlproc);

        if(curl_errno($curlproc)){
            // Read the message first, then release the handle so it is not
            // left open on the error path.
            $message = curl_error($curlproc);
            curl_close($curlproc);
            throw new Exception($message);
        }

        curl_close($curlproc);
        return $data;
    }

    /**
     * Web search.
     *
     * @param array $get Request parameters: "s" (query), "country", "nsfw".
     * @return array Result set; "web", "news", "image" and "spelling" are
     *               filled from the API response, "npt" is always null.
     * @throws Exception On an empty query, a failed fetch, a Cloudflare
     *                   block page, or undecodable JSON.
     */
    public function web($get){

        // https://api.yep.com/fs/2/search?client=web&gl=CA&no_correct=false&q=undefined+variable+javascript&safeSearch=off&type=web
        $json = $this->fetch_results($get, "web", true);

        $out = [
            "status" => "ok",
            "spelling" => [
                "type" => "no_correction",
                "using" => null,
                "correction" => null
            ],
            "npt" => null,
            "answer" => [],
            "web" => [],
            "image" => [],
            "video" => [],
            "news" => [],
            "related" => []
        ];

        // The payload of interest lives at index 1 of the decoded response.
        if(isset($json[1]["correction"])){

            $out["spelling"] = [
                "type" => "not_many",
                "using" => $get["s"],
                "correction" => $json[1]["correction"][1]
            ];
        }

        if(isset($json[1]["results"])){
            foreach($json[1]["results"] as $item){

                // Only "organic" entries are web results; skip the rest.
                if(strtolower($item["type"]) !== "organic"){
                    continue;
                }

                $sublinks = [];

                if(isset($item["sitelinks"]["full"])){
                    foreach($item["sitelinks"]["full"] as $link){

                        $sublinks[] = [
                            "title" => $link["title"],
                            "date" => null,
                            "description" => $this->snippet($link["snippet"]),
                            "url" => $link["url"]
                        ];
                    }
                }

                $out["web"][] = [
                    "title" => $item["title"],
                    "description" => $this->snippet($item["snippet"]),
                    "url" => $item["url"],
                    "date" => $this->timestamp($item),
                    "type" => "web",
                    "thumb" => [
                        "url" => null,
                        "ratio" => null
                    ],
                    "sublink" => $sublinks,
                    "table" => []
                ];
            }
        }

        if(isset($json[1]["featured_news"])){
            foreach($json[1]["featured_news"] as $news){

                $out["news"][] = [
                    "title" => $news["title"],
                    "description" => $this->snippet($news["snippet"]),
                    "date" => $this->timestamp($news),
                    "thumb" => $this->newsthumb($news),
                    "url" => $news["url"]
                ];
            }
        }

        if(isset($json[1]["featured_images"])){
            foreach($json[1]["featured_images"] as $image){

                $out["image"][] = $this->imageresult($image);
            }
        }

        return $out;
    }

    /**
     * Image search.
     *
     * @param array $get Request parameters: "s" (query), "country", "nsfw".
     * @return array ["status", "npt" (always null), "image" => list].
     * @throws Exception On an empty query, a failed fetch, a Cloudflare
     *                   block page, or undecodable JSON.
     */
    public function image($get){

        // no nextpage! The image request is sent without the "limit"
        // parameter used by web() and news().
        $json = $this->fetch_results($get, "images", false);

        $out = [
            "status" => "ok",
            "npt" => null,
            "image" => []
        ];

        if(isset($json[1]["results"])){
            foreach($json[1]["results"] as $item){

                $out["image"][] = $this->imageresult($item);
            }
        }

        return $out;
    }

    /**
     * News search.
     *
     * @param array $get Request parameters: "s" (query), "country", "nsfw".
     * @return array ["status", "npt" (always null), "news" => list].
     * @throws Exception On an empty query, a failed fetch, a Cloudflare
     *                   block page, or undecodable JSON.
     */
    public function news($get){

        $json = $this->fetch_results($get, "news", true);

        $out = [
            "status" => "ok",
            "npt" => null,
            "news" => []
        ];

        if(isset($json[1]["results"])){
            foreach($json[1]["results"] as $item){

                $out["news"][] = [
                    "title" => $item["title"],
                    "author" => null,
                    "description" => $this->snippet($item["snippet"]),
                    "date" => $this->timestamp($item),
                    "thumb" => $this->newsthumb($item),
                    "url" => $item["url"]
                ];
            }
        }

        return $out;
    }

    /**
     * Validate the request, query the search endpoint and decode the reply.
     *
     * @param array  $get   Request parameters: "s", "country", "nsfw".
     * @param string $type  Value of the API "type" parameter.
     * @param bool   $limit Whether to send "limit=99999".
     * @return array Decoded JSON response.
     * @throws Exception On an empty query, a failed fetch, a Cloudflare
     *                   block page, or undecodable JSON.
     */
    private function fetch_results($get, $type, $limit){

        $search = $get["s"];
        if(strlen($search) === 0){
            throw new Exception("Search term is empty!");
        }

        $country = $get["country"];

        // Parameter order is kept identical to the original requests.
        $params = [
            "client" => "web",
            // "all" is sent as-is, country codes are upper-cased
            "gl" => $country == "all" ? $country : strtoupper($country)
        ];

        if($limit){
            $params["limit"] = "99999";
        }

        $params["no_correct"] = "false";
        $params["q"] = $search;
        $params["safeSearch"] = $this->safesearch($get["nsfw"]);
        $params["type"] = $type;

        try{
            $json =
                $this->get(
                    $this->backend->get_ip(),
                    "https://api.yep.com/fs/2/search",
                    $params
                );
        }catch(Exception $error){
            // Same message as before; the cause is chained for debugging.
            throw new Exception("Failed to fetch JSON", 0, $error);
        }

        $this->detect_cf($json);

        $json = json_decode($json, true);
        if($json === null){
            throw new Exception("Failed to decode JSON");
        }

        return $json;
    }

    /**
     * Translate the "nsfw" filter value into the API's safeSearch value.
     * Unrecognized values are passed through unchanged.
     */
    private function safesearch($nsfw){

        switch($nsfw){
            case "yes": return "off";
            case "maybe": return "moderate";
            case "no": return "strict";
        }

        return $nsfw;
    }

    /**
     * Convert an API snippet to plain text: decode entities, strip tags and
     * drop a trailing ellipsis.
     */
    private function snippet($html){

        return
            $this->titledots(
                strip_tags(
                    html_entity_decode(
                        $html
                    )
                )
            );
    }

    /**
     * Parse an item's "first_seen" field into a UNIX timestamp.
     *
     * @return int|null Timestamp, or null when the field is absent or cannot
     *                  be parsed (null is what the other "date" fields in
     *                  this file use for "unknown").
     */
    private function timestamp($item){

        if(!isset($item["first_seen"])){
            return null;
        }

        $time = strtotime($item["first_seen"]);

        return $time === false ? null : $time;
    }

    /**
     * Build the "thumb" entry of a news item from its optional "img" field.
     */
    private function newsthumb($item){

        if(isset($item["img"])){
            return [
                "url" => $this->unshiturl($item["img"]),
                "ratio" => "16:9"
            ];
        }

        return [
            "url" => null,
            "ratio" => null
        ];
    }

    /**
     * Build one image result: the full-size source first, then a thumbnail
     * whose width is capped at 260 with the height scaled to match.
     *
     * All dimensions are null when the API reports no usable size. Missing
     * or non-positive dimensions are treated like 0 so the scale factor
     * never divides by zero.
     */
    private function imageresult($image){

        $width = isset($image["width"]) ? (int)$image["width"] : 0;
        $height = isset($image["height"]) ? (int)$image["height"] : 0;

        if(
            $width > 0 &&
            $height > 0
        ){
            $thumb_width = $width >= 260 ? 260 : $width;
            $thumb_height = ceil($height * ($thumb_width / $width));
        }else{
            $thumb_width = null;
            $thumb_height = null;
            $width = null;
            $height = null;
        }

        return [
            "title" => $image["title"],
            "source" => [
                [
                    "url" => $image["image_id"],
                    "width" => $width,
                    "height" => $height
                ],
                [
                    "url" => $image["src"],
                    "width" => $thumb_width,
                    "height" => $thumb_height
                ]
            ],
            "url" => $image["host_page"]
        ];
    }

    /**
     * Throw when the payload is a Cloudflare block page instead of JSON.
     *
     * @param string $payload Raw response body.
     * @throws Exception When a "cf-wrapper" element is found in the payload.
     */
    private function detect_cf($payload){

        // detect cloudflare page
        $this->fuckhtml->load($payload);

        if(
            count(
                $this->fuckhtml
                ->getElementsByClassName(
                    "cf-wrapper",
                    "div"
                )
            ) !== 0
        ){
            throw new Exception("Blocked by Cloudflare. Please follow curl-impersonate installation instructions");
        }
    }

    /**
     * Remove a trailing ellipsis ("..." or U+2026) from a title and trim it.
     *
     * The ellipsis must start within the last 4 bytes of the string; the
     * text is cut exactly where it starts, so nothing in front of it is
     * lost. Titles without an ellipsis are only trimmed.
     */
    private function titledots($title){

        $tail = substr($title, -4);

        // "\xE2\x80\xA6" is the UTF-8 encoding of U+2026 (horizontal
        // ellipsis), spelled as bytes so it survives re-encoding of this file.
        foreach(["...", "\xE2\x80\xA6"] as $dots){

            $pos = strpos($tail, $dots);

            if($pos !== false){
                return
                    trim(
                        substr(
                            $title,
                            0,
                            strlen($title) - strlen($tail) + $pos
                        )
                    );
            }
        }

        return trim($title);
    }

    /**
     * Unwrap a proxied image URL: when the URL has a "url" query parameter,
     * return that parameter, otherwise return the URL unchanged.
     */
    private function unshiturl($url){

        $query = parse_url($url, PHP_URL_QUERY);

        // No query string (or an unparseable URL): nothing to unwrap.
        // Passing null on to parse_str() is deprecated since PHP 8.1.
        if(!is_string($query)){
            return $url;
        }

        parse_str($query, $params);

        if(isset($params["url"])){
            return $params["url"];
        }

        return $url;
    }
}

1727
scraper/yt.php Normal file

File diff suppressed because it is too large Load diff