From c36f73caa246293310fa2e9f03652bcc4870c6cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E5=BF=97?= Date: Tue, 20 Jan 2026 18:30:18 +0800 Subject: [PATCH] update --- app/controller/Crawler.php | 207 ++++++++------------------------- app/service/CrawlerService.php | 204 +++++++++++++++++++++++--------- 2 files changed, 196 insertions(+), 215 deletions(-) diff --git a/app/controller/Crawler.php b/app/controller/Crawler.php index f40ffdf..6978354 100644 --- a/app/controller/Crawler.php +++ b/app/controller/Crawler.php @@ -39,29 +39,24 @@ class Crawler extends BaseController ]); } - if (empty($cookiesParam)) { + $service = new CrawlerService(); + try { + $cookies = $service->parseCookies($cookiesParam); + } catch (\Exception $e) { return json([ 'code' => 0, - 'msg' => '请填写Cookie数据', + 'msg' => $e->getMessage(), ]); } - - // 解析JSON格式的cookies - $cookies = is_string($cookiesParam) ? json_decode($cookiesParam, true) : $cookiesParam; - if (json_last_error() !== JSON_ERROR_NONE || empty($cookies)) { - return json([ - 'code' => 0, - 'msg' => 'Cookie数据格式错误,请检查JSON格式', - ]); - } - - $cookieString = $this->buildCookieString($cookies); + + $cookieString = $service->buildCookieString($cookies); // 构建URL获取HTML - 使用GET请求 - $url = "http://gzrsks.oumakspt.com:62/tyzpwb/stuchooseexam/selectPosition.htm?examstupid=1015&userid={$userid}&bmid={$bmid}&examid={$examid}&aa={$aa}"; + $baseUrl = $service->getBaseUrl(); + $url = "{$baseUrl}/stuchooseexam/selectPosition.htm?examstupid=1015&userid={$userid}&bmid={$bmid}&examid={$examid}&aa={$aa}"; // 构建Referer URL(与浏览器实际访问一致) - $refererUrl = "http://gzrsks.oumakspt.com:62/tyzpwb/stuchooseexam/input.htm"; + $refererUrl = "{$baseUrl}/stuchooseexam/input.htm"; $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $url); @@ -72,17 +67,7 @@ class Crawler extends BaseController curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false); curl_setopt($ch, CURLOPT_ENCODING, ''); // 自动处理gzip编码 - curl_setopt($ch, CURLOPT_HTTPHEADER, [ - 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', - 'Accept-Language: zh-CN,zh;q=0.9', - 'Accept-Encoding: gzip, deflate', - 'Cache-Control: no-cache', - 'Connection: keep-alive', - 'Cookie: ' . $cookieString, - 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36', - 'Referer: ' . $refererUrl, - 'Upgrade-Insecure-Requests: 1', - ]); + curl_setopt($ch, CURLOPT_HTTPHEADER, $service->buildHtmlHeaders($cookieString, $refererUrl)); $html = curl_exec($ch); $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE); @@ -103,7 +88,6 @@ class Crawler extends BaseController ]); } - $service = new CrawlerService(); $options = $service->extractDsdmOptions($html); return json([ @@ -120,53 +104,6 @@ class Crawler extends BaseController } } - /** - * 构建Cookie字符串 - * @param array $cookies Cookie数组 - * @return string - */ - private function buildCookieString(array|string $cookies): string - { - // 如果直接传入原始Cookie字符串,优先使用 - if (is_string($cookies)) { - return trim($cookies); - } - - // 只保留用户填写的Cookie,支持同名键多值(如双JSESSIONID) - $cookieData = $cookies['请求 Cookie'] ?? $cookies; - $normalized = []; - - foreach ($cookieData as $key => $value) { - $k = trim((string)$key); - if ($k === '') { - continue; - } - $vals = is_array($value) ? $value : [$value]; - foreach ($vals as $v) { - $v = trim((string)$v); - if ($v === '') { - continue; - } - $normalized[$k][] = $v; - } - } - - // 将 JSESSIONID 放在最前(如果存在),其余按键名顺序,多值全部保留 - $parts = []; - if (isset($normalized['JSESSIONID'])) { - foreach ($normalized['JSESSIONID'] as $v) { - $parts[] = 'JSESSIONID=' . $v; - } - unset($normalized['JSESSIONID']); - } - foreach ($normalized as $k => $vArr) { - foreach ($vArr as $v) { - $parts[] = $k . '=' . $v; - } - } - - return implode('; ', $parts); - } /** * 获取职位代码列表 @@ -202,23 +139,16 @@ class Crawler extends BaseController ]); } - if (empty($cookiesParam)) { - return json([ - 'code' => 0, - 'msg' => '请填写Cookie数据', - ]); - } - - // 解析JSON格式的cookies - $cookies = is_string($cookiesParam) ? json_decode($cookiesParam, true) : $cookiesParam; - if (json_last_error() !== JSON_ERROR_NONE || empty($cookies)) { - return json([ - 'code' => 0, - 'msg' => 'Cookie数据格式错误,请检查JSON格式', - ]); - } - $service = new CrawlerService(); + try { + $cookies = $service->parseCookies($cookiesParam); + } catch (\Exception $e) { + return json([ + 'code' => 0, + 'msg' => $e->getMessage(), + ]); + } + $treeData = $service->getPositionTree($dsdm, $examid, $bmid, $userid, (string)$aa, $cookies); // 提取所有CODE作为zwdm @@ -268,22 +198,16 @@ class Crawler extends BaseController ]); } - if (empty($cookiesParam)) { - return json([ - 'code' => 0, - 'msg' => '请填写Cookie数据', - ]); - } - - $cookies = is_string($cookiesParam) ? json_decode($cookiesParam, true) : $cookiesParam; - if (json_last_error() !== JSON_ERROR_NONE || empty($cookies)) { - return json([ - 'code' => 0, - 'msg' => 'Cookie数据格式错误,请检查JSON格式', - ]); - } - $service = new CrawlerService(); + try { + $cookies = $service->parseCookies($cookiesParam); + } catch (\Exception $e) { + return json([ + 'code' => 0, + 'msg' => $e->getMessage(), + ]); + } + $treeData = $service->getPositionTree($dsdm, $examid, $bmid, $userid, (string)$aa, $cookies); $zwdmList = []; @@ -339,48 +263,20 @@ class Crawler extends BaseController ]); } - if (empty($cookiesParam)) { - return json([ - 'code' => 0, - 'msg' => '请填写Cookie数据', - ]); - } - - // 解析JSON格式的cookies - $cookies = is_string($cookiesParam) ? json_decode($cookiesParam, true) : $cookiesParam; - if (json_last_error() !== JSON_ERROR_NONE || empty($cookies)) { - return json([ - 'code' => 0, - 'msg' => 'Cookie数据格式错误,请检查JSON格式', - ]); - } - $service = new CrawlerService(); + try { + $cookies = $service->parseCookies($cookiesParam); + } catch (\Exception $e) { + return json([ + 'code' => 0, + 'msg' => $e->getMessage(), + ]); + } + $info = $service->getPositionInfo($zwdm, $examid, $cookies); if (!empty($info)) { - // 处理单条数据或数组数据 - if (isset($info[0])) { - $item = $info[0]; - } else { - $item = $info; - } - - // 计算竞争比(格式:招聘人数:审核通过人数) - $zprs = isset($item['zprs']) ? intval($item['zprs']) : 0; - $bkrs = isset($item['bkrs']) ? intval($item['bkrs']) : 0; - $competitionRatio = $zprs > 0 && $bkrs > 0 ? $zprs . ':' . $bkrs : ($zprs > 0 ? $zprs . ':0' : '0:0'); - - $result = [ - 'sbmc' => $item['sbmc'] ?? '', // 省份 - 'dsmc' => $item['dsmc'] ?? '', // 地区 - 'zpdwmc' => $item['zpdwmc'] ?? '', // 招聘单位/用人司局 - 'zwmc' => $item['zwmc'] ?? '', // 职位名称 - 'zwdm' => $item['zwdm'] ?? $zwdm, // 职位代码 - 'zprs' => $zprs, // 招聘人数 - 'bkrs' => $bkrs, // 审核通过人数 - 'competition_ratio' => $competitionRatio, // 竞争比(格式:招聘人数:审核通过人数) - ]; + $result = $service->formatPositionInfo($info, $zwdm); return json([ 'code' => 1, @@ -428,23 +324,16 @@ class Crawler extends BaseController ]); } - if (empty($cookiesParam)) { - return json([ - 'code' => 0, - 'msg' => '请填写Cookie数据', - ]); - } - - // 解析JSON格式的cookies - $cookies = is_string($cookiesParam) ? json_decode($cookiesParam, true) : $cookiesParam; - if (json_last_error() !== JSON_ERROR_NONE || empty($cookies)) { - return json([ - 'code' => 0, - 'msg' => 'Cookie数据格式错误,请检查JSON格式', - ]); - } - $service = new CrawlerService(); + try { + $cookies = $service->parseCookies($cookiesParam); + } catch (\Exception $e) { + return json([ + 'code' => 0, + 'msg' => $e->getMessage(), + ]); + } + $results = $service->batchGetPositionInfo($zwdmList, $examid, $cookies); return json([ diff --git a/app/service/CrawlerService.php b/app/service/CrawlerService.php index 4d76682..c83aa88 100644 --- a/app/service/CrawlerService.php +++ b/app/service/CrawlerService.php @@ -10,9 +10,91 @@ namespace app\service; class CrawlerService { /** - * 基础URL + * 基础URL(域名和端口) */ - private $baseUrl = 'http://gzrsks.oumakspt.com:62/tyzpwb'; + private const BASE_URL = 'http://gzrsks.oumakspt.com:62'; + + /** + * 应用路径 + */ + private const APP_PATH = '/tyzpwb'; + + /** + * 获取完整基础URL(包含应用路径) + * @return string + */ + public function getBaseUrl(): string + { + return self::BASE_URL . self::APP_PATH; + } + + /** + * 构建HTTP请求头(用于HTML页面请求) + * @param string $cookieString Cookie字符串 + * @param string $referer Referer URL + * @return array + */ + public function buildHtmlHeaders(string $cookieString, string $referer): array + { + return [ + 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'Accept-Language: zh-CN,zh;q=0.9', + 'Accept-Encoding: gzip, deflate', + 'Cache-Control: no-cache', + 'Connection: keep-alive', + 'Cookie: ' . $cookieString, + 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36', + 'Referer: ' . $referer, + 'Upgrade-Insecure-Requests: 1', + ]; + } + + /** + * 构建HTTP请求头(用于AJAX请求) + * @param string $cookieString Cookie字符串 + * @param string $referer Referer URL + * @param string $browserType 浏览器类型:'chrome' 或 'firefox',默认为 'chrome' + * @param string|null $origin Origin URL(可选,Firefox时自动使用BASE_URL) + * @param bool $withCharset 是否在Content-Type中包含charset=UTF-8,默认为false + * @return array + */ + public function buildAjaxHeaders(string $cookieString, string $referer, string $browserType = 'chrome', ?string $origin = null, bool $withCharset = false): array + { + $isFirefox = $browserType === 'firefox'; + + $headers = [ + 'Accept: text/plain, */*', + 'Accept-Language: zh-CN,zh;q=0.9,zh-TW;q=0.8,zh-HK;q=0.7,en-US;q=0.6,en;q=0.5', + 'Accept-Encoding: gzip, deflate', + 'Cache-Control: no-cache', + 'Connection: keep-alive', + 'Content-Type: application/x-www-form-urlencoded' . ($withCharset ? '; charset=UTF-8' : ''), + 'Cookie: ' . $cookieString, + 'Referer: ' . $referer, + 'X-Requested-With: XMLHttpRequest', + ]; + + // Firefox特有:添加Pragma头 + if ($isFirefox) { + $headers[] = 'Pragma: no-cache'; + } + + // User-Agent + if ($isFirefox) { + $headers[] = 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:147.0) Gecko/20100101 Firefox/147.0'; + } else { + $headers[] = 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'; + } + + // Origin + if ($isFirefox) { + $headers[] = 'Origin: ' . self::BASE_URL; + } elseif ($origin !== null) { + $headers[] = 'Origin: ' . $origin; + } + + return $headers; + } /** * 获取职位树数据 @@ -26,12 +108,13 @@ class CrawlerService */ public function getPositionTree(string $dsdm, string $examid, string $bmid, string $userid, string $aa, array $cookies): array { - $url = $this->baseUrl . '/tree/getPositionTree.htm'; + $baseUrl = $this->getBaseUrl(); + $url = $baseUrl . '/tree/getPositionTree.htm'; $cookieString = $this->buildCookieString($cookies); // 构建Referer URL(包含完整参数) - $refererUrl = "http://gzrsks.oumakspt.com:62/tyzpwb/stuchooseexam/selectPosition.htm?examstupid=1015&userid={$userid}&bmid={$bmid}&examid={$examid}&aa={$aa}"; + $refererUrl = $baseUrl . "/stuchooseexam/selectPosition.htm?examstupid=1015&userid={$userid}&bmid={$bmid}&examid={$examid}&aa={$aa}"; // 构建POST参数 $postData = [ @@ -48,20 +131,7 @@ class CrawlerService curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false); curl_setopt($ch, CURLOPT_ENCODING, ''); // 自动处理gzip编码 - curl_setopt($ch, CURLOPT_HTTPHEADER, [ - 'Accept: text/plain, */*', - 'Accept-Language: zh-CN,zh;q=0.9,zh-TW;q=0.8,zh-HK;q=0.7,en-US;q=0.6,en;q=0.5', - 'Accept-Encoding: gzip, deflate', - 'Pragma: no-cache', - 'Cache-Control: no-cache', - 'Connection: keep-alive', - 'Content-Type: application/x-www-form-urlencoded', - 'Cookie: ' . $cookieString, - 'Origin: http://gzrsks.oumakspt.com:62', - 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:147.0) Gecko/20100101 Firefox/147.0', - 'Referer: ' . $refererUrl, - 'X-Requested-With: XMLHttpRequest', - ]); + curl_setopt($ch, CURLOPT_HTTPHEADER, $this->buildAjaxHeaders($cookieString, $refererUrl, 'firefox')); curl_setopt($ch, CURLOPT_POST, true); curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($postData)); curl_setopt($ch, CURLOPT_TIMEOUT, 30); @@ -98,10 +168,14 @@ class CrawlerService */ public function getPositionInfo(string $zwdm, string $examid, array $cookies): array { - $url = $this->baseUrl . '/stuchooseexam/getPositionInfo.htm'; + $baseUrl = $this->getBaseUrl(); + $url = $baseUrl . '/stuchooseexam/getPositionInfo.htm'; $cookieString = $this->buildCookieString($cookies); + $referer = $baseUrl . '/stuchooseexam/selectPosition.htm'; + $origin = self::BASE_URL; + $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); @@ -109,19 +183,7 @@ class CrawlerService curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false); curl_setopt($ch, CURLOPT_ENCODING, ''); // 自动处理gzip编码 - curl_setopt($ch, CURLOPT_HTTPHEADER, [ - 'Accept: text/plain, */*', - 'Accept-Language: zh-CN,zh;q=0.9,zh-TW;q=0.8,zh-HK;q=0.7,en-US;q=0.6,en;q=0.5', - 'Accept-Encoding: gzip, deflate', - 'Cache-Control: no-cache', - 'Connection: keep-alive', - 'Content-Type: application/x-www-form-urlencoded; charset=UTF-8', - 'Cookie: ' . $cookieString, - 'Origin: http://gzrsks.oumakspt.com:62', - 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36', - 'Referer: http://gzrsks.oumakspt.com:62/tyzpwb/stuchooseexam/selectPosition.htm', - 'X-Requested-With: XMLHttpRequest', - ]); + curl_setopt($ch, CURLOPT_HTTPHEADER, $this->buildAjaxHeaders($cookieString, $referer, 'chrome', $origin, true)); curl_setopt($ch, CURLOPT_POST, true); curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query([ 'zwdm' => $zwdm, @@ -168,28 +230,7 @@ class CrawlerService $info = $this->getPositionInfo($zwdm, $examid, $cookies); if (!empty($info)) { - // 处理单条数据或数组数据 - if (isset($info[0])) { - $item = $info[0]; - } else { - $item = $info; - } - - // 计算竞争比(格式:招聘人数:审核通过人数) - $zprs = isset($item['zprs']) ? intval($item['zprs']) : 0; - $bkrs = isset($item['bkrs']) ? intval($item['bkrs']) : 0; - $competitionRatio = $zprs > 0 && $bkrs > 0 ? $zprs . ':' . $bkrs : ($zprs > 0 ? $zprs . ':0' : '0:0'); - - $results[] = [ - 'sbmc' => $item['sbmc'] ?? '', // 省份 - 'dsmc' => $item['dsmc'] ?? '', // 地区 - 'zpdwmc' => $item['zpdwmc'] ?? '', // 招聘单位/用人司局 - 'zwmc' => $item['zwmc'] ?? '', // 职位名称 - 'zwdm' => $item['zwdm'] ?? $zwdm, // 职位代码 - 'zprs' => $zprs, // 招聘人数 - 'bkrs' => $bkrs, // 审核通过人数 - 'competition_ratio' => $competitionRatio, // 竞争比(格式:招聘人数:审核通过人数) - ]; + $results[] = $this->formatPositionInfo($info, $zwdm); } // 避免请求过快,添加小延迟 @@ -207,12 +248,63 @@ class CrawlerService return $results; } + /** + * 解析并验证Cookie数据 + * @param mixed $cookiesParam Cookie参数(可能是字符串或数组) + * @return array + * @throws \Exception + */ + public function parseCookies($cookiesParam): array + { + if (empty($cookiesParam)) { + throw new \Exception('请填写Cookie数据'); + } + + // 解析JSON格式的cookies + $cookies = is_string($cookiesParam) ? json_decode($cookiesParam, true) : $cookiesParam; + if (json_last_error() !== JSON_ERROR_NONE || empty($cookies)) { + throw new \Exception('Cookie数据格式错误,请检查JSON格式'); + } + + return $cookies; + } + + /** + * 格式化职位信息(包含竞争比计算) + * @param array $item 原始职位数据 + * @param string $zwdm 职位代码(备用) + * @return array + */ + public function formatPositionInfo(array $item, string $zwdm = ''): array + { + // 处理单条数据或数组数据 + if (isset($item[0])) { + $item = $item[0]; + } + + // 计算竞争比(格式:招聘人数:审核通过人数) + $zprs = isset($item['zprs']) ? intval($item['zprs']) : 0; + $bkrs = isset($item['bkrs']) ? intval($item['bkrs']) : 0; + $competitionRatio = $zprs > 0 && $bkrs > 0 ? $zprs . ':' . $bkrs : ($zprs > 0 ? $zprs . ':0' : '0:0'); + + return [ + 'sbmc' => $item['sbmc'] ?? '', // 省份 + 'dsmc' => $item['dsmc'] ?? '', // 地区 + 'zpdwmc' => $item['zpdwmc'] ?? '', // 招聘单位/用人司局 + 'zwmc' => $item['zwmc'] ?? '', // 职位名称 + 'zwdm' => $item['zwdm'] ?? $zwdm, // 职位代码 + 'zprs' => $zprs, // 招聘人数 + 'bkrs' => $bkrs, // 审核通过人数 + 'competition_ratio' => $competitionRatio, // 竞争比(格式:招聘人数:审核通过人数) + ]; + } + /** * 构建Cookie字符串 - * @param array $cookies Cookie数组 + * @param array|string $cookies Cookie数组或字符串 * @return string */ - private function buildCookieString(array|string $cookies): string + public function buildCookieString(array|string $cookies): string { // 如果直接传入原始Cookie字符串,优先使用 if (is_string($cookies)) {