getBaseUrl();
    }

    /**
     * Returns the full base URL: the host part followed by the application path.
     *
     * @return string
     */
    public function getBaseUrl(): string
    {
        return $this->getBaseUrlHost() . self::APP_PATH;
    }

    /**
     * Builds the HTTP request headers used for HTML page requests.
     *
     * @param string $cookieString Value placed after "Cookie: "
     * @param string $referer      Value placed after "Referer: "
     * @return array List of raw "Name: value" header lines
     */
    public function buildHtmlHeaders(string $cookieString, string $referer): array
    {
        return [
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language: zh-CN,zh;q=0.9',
            'Accept-Encoding: gzip, deflate',
            'Cache-Control: no-cache',
            'Connection: keep-alive',
            'Cookie: ' . $cookieString,
            'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
            'Referer: ' . $referer,
            'Upgrade-Insecure-Requests: 1',
        ];
    }

    /**
     * Builds the HTTP request headers used for AJAX (XMLHttpRequest) form posts.
     *
     * For 'firefox' the list additionally carries a Pragma header, a Firefox
     * User-Agent and an Origin taken from getBaseUrlHost() ($origin is ignored).
     * For any other browser type a Chrome User-Agent is sent and Origin is only
     * added when $origin is given.
     *
     * @param string      $cookieString Value placed after "Cookie: "
     * @param string      $referer      Value placed after "Referer: "
     * @param string      $browserType  'chrome' (default) or 'firefox'
     * @param string|null $origin       Optional Origin value (non-Firefox only)
     * @param bool        $withCharset  Append "; charset=UTF-8" to Content-Type
     * @return array List of raw "Name: value" header lines
     */
    public function buildAjaxHeaders(string $cookieString, string $referer, string $browserType = 'chrome', ?string $origin = null, bool $withCharset = false): array
    {
        $headers = [
            'Accept: text/plain, */*',
            'Accept-Language: zh-CN,zh;q=0.9,zh-TW;q=0.8,zh-HK;q=0.7,en-US;q=0.6,en;q=0.5',
            'Accept-Encoding: gzip, deflate',
            'Cache-Control: no-cache',
            'Connection: keep-alive',
            'Content-Type: application/x-www-form-urlencoded' . ($withCharset ? '; charset=UTF-8' : ''),
            'Cookie: ' . $cookieString,
            'Referer: ' . $referer,
            'X-Requested-With: XMLHttpRequest',
        ];

        if ($browserType === 'firefox') {
            // Firefox profile: Pragma, then User-Agent, then Origin (order preserved).
            $headers[] = 'Pragma: no-cache';
            $headers[] = 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:147.0) Gecko/20100101 Firefox/147.0';
            $headers[] = 'Origin: ' . $this->getBaseUrlHost();

            return $headers;
        }

        $headers[] = 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36';
        if ($origin !== null) {
            $headers[] = 'Origin: ' . $origin;
        }

        return $headers;
    }

    /**
     * Sends a form-encoded POST and returns the JSON-decoded response as an array.
     *
     * Shared transport for getPositionTree() and getPositionInfo().
     *
     * @param string $url     Absolute request URL
     * @param array  $headers Raw header lines (see buildAjaxHeaders())
     * @param array  $fields  POST fields, encoded with http_build_query()
     * @return array Decoded JSON; an empty array when the body decodes to a
     *               non-array or empty value
     * @throws \Exception On transport errors, a non-200 status code or invalid JSON
     */
    private function postFormForJson(string $url, array $headers, array $fields): array
    {
        $ch = curl_init();
        if ($ch === false) {
            throw new \Exception('请求失败: cURL初始化失败');
        }

        try {
            curl_setopt_array($ch, [
                CURLOPT_URL => $url,
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_FOLLOWLOCATION => true,
                // NOTE(review): TLS peer/host verification is disabled, as in the
                // original code; confirm this is really required for the target site.
                CURLOPT_SSL_VERIFYPEER => false,
                CURLOPT_SSL_VERIFYHOST => false,
                // Empty string lets cURL negotiate and decode gzip/deflate itself.
                CURLOPT_ENCODING => '',
                CURLOPT_HTTPHEADER => $headers,
                CURLOPT_POST => true,
                CURLOPT_POSTFIELDS => http_build_query($fields),
                CURLOPT_TIMEOUT => 30,
                CURLOPT_CONNECTTIMEOUT => 10,
            ]);

            $response = curl_exec($ch);
            $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $error = curl_error($ch);
        } finally {
            curl_close($ch);
        }

        if ($error || $response === false) {
            throw new \Exception('请求失败: ' . $error);
        }

        if ($httpCode !== 200) {
            throw new \Exception('请求失败,HTTP状态码: ' . $httpCode);
        }

        $data = json_decode($response, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new \Exception('JSON解析失败: ' . json_last_error_msg());
        }

        // A valid JSON scalar (e.g. "ok" or 1) must not leak through the array return type.
        return is_array($data) ? $data : [];
    }

    /**
     * Fetches the position tree data.
     *
     * @param string $dsdm    Region code
     * @param string $examid  Exam ID
     * @param string $bmid    Department ID
     * @param string $userid  User ID
     * @param string $aa      Timestamp (must match the one used by the selectPosition page)
     * @param array  $cookies Cookie data (see buildCookieString())
     * @return array Decoded JSON response
     * @throws \Exception On transport errors, a non-200 status code or invalid JSON
     */
    public function getPositionTree(string $dsdm, string $examid, string $bmid, string $userid, string $aa, array $cookies): array
    {
        $baseUrl = $this->getBaseUrl();
        $cookieString = $this->buildCookieString($cookies);

        // Referer carries the full parameter set of the selectPosition page.
        $refererUrl = $baseUrl . "/stuchooseexam/selectPosition.htm?examstupid=1015&userid={$userid}&bmid={$bmid}&examid={$examid}&aa={$aa}";

        return $this->postFormForJson(
            $baseUrl . '/tree/getPositionTree.htm',
            $this->buildAjaxHeaders($cookieString, $refererUrl, 'firefox'),
            [
                'examid' => $examid,
                'bmid' => $bmid,
                'userid' => $userid,
                'dsdm' => $dsdm,
            ]
        );
    }

    /**
     * Collects the position codes of all tree nodes that are not flagged "nocheck".
     *
     * The node list is $treeData['data'] or $treeData['tree'] when one of them is
     * an array, otherwise $treeData itself. Each node is expected to look like
     * [ 'CODE' => ..., 'TITLE' => ..., 'nocheck' => ... ].
     *
     * @param array $treeData Return value of getPositionTree() (bare list or wrapped)
     * @return string[] Non-empty CODE values, in input order
     */
    public function collectPositionCodesExcludingNocheck(array $treeData): array
    {
        $nodes = $treeData;
        if (isset($treeData['data']) && is_array($treeData['data'])) {
            $nodes = $treeData['data'];
        } elseif (isset($treeData['tree']) && is_array($treeData['tree'])) {
            $nodes = $treeData['tree'];
        }

        $codes = [];
        foreach ($nodes as $node) {
            if (!is_array($node)) {
                continue;
            }

            // Only true / 1 / "true" (any case) count as set, so the string
            // "false" is not mistaken for a truthy flag.
            $flag = $node['nocheck'] ?? $node['noCheck'] ?? null;
            $isNocheck = $flag === true
                || $flag === 1
                || (is_string($flag) && strtolower($flag) === 'true');
            if ($isNocheck) {
                continue;
            }

            $code = $node['CODE'] ?? null;
            if (!is_string($code) && !is_int($code) && !is_float($code)) {
                continue;
            }

            // Normalise to string so the result matches the documented string[]
            // and can be passed to getPositionInfo() unchanged.
            $code = (string) $code;
            if ($code !== '') {
                $codes[] = $code;
            }
        }

        return $codes;
    }

    /**
     * Fetches the detail record of a single position.
     *
     * @param string $zwdm    Position code
     * @param string $examid  Exam ID
     * @param array  $cookies Cookie data (see buildCookieString())
     * @return array Decoded JSON response
     * @throws \Exception On transport errors, a non-200 status code or invalid JSON
     */
    public function getPositionInfo(string $zwdm, string $examid, array $cookies): array
    {
        $baseUrl = $this->getBaseUrl();
        $cookieString = $this->buildCookieString($cookies);
        $referer = $baseUrl . '/stuchooseexam/selectPosition.htm';
        $origin = $this->getBaseUrlHost();

        return $this->postFormForJson(
            $baseUrl . '/stuchooseexam/getPositionInfo.htm',
            $this->buildAjaxHeaders($cookieString, $referer, 'chrome', $origin, true),
            [
                'zwdm' => $zwdm,
                'examid' => $examid,
            ]
        );
    }

    /**
     * Fetches and formats the detail records for a list of position codes.
     *
     * A failing request does not abort the batch; it is reported as
     * [ 'zwdm' => ..., 'error' => message ] in the result list.
     *
     * @param array  $zwdmList Position codes
     * @param string $examid   Exam ID
     * @param array  $cookies  Cookie data
     * @return array Formatted records and/or error entries, in input order
     */
    public function batchGetPositionInfo(array $zwdmList, string $examid, array $cookies): array
    {
        $results = [];
        $isFirstRequest = true;

        foreach ($zwdmList as $zwdm) {
            // Throttle between consecutive requests (0.2s), including after a
            // failed one, so errors do not turn into a burst of retries.
            if (!$isFirstRequest) {
                usleep(200000);
            }
            $isFirstRequest = false;

            try {
                $info = $this->getPositionInfo($zwdm, $examid, $cookies);
                if (!empty($info)) {
                    $results[] = $this->formatPositionInfo($info, $zwdm);
                }
            } catch (\Exception $e) {
                // Record the failure and keep processing the remaining codes.
                $results[] = [
                    'zwdm' => $zwdm,
                    'error' => $e->getMessage(),
                ];
            }
        }

        return $results;
    }

    /**
     * Parses and validates the cookie input.
     *
     * @param mixed $cookiesParam JSON string or already-decoded array
     * @return array Non-empty cookie array
     * @throws \Exception When the input is empty, is not valid JSON, or does not
     *                    yield a non-empty array
     */
    public function parseCookies($cookiesParam): array
    {
        if (empty($cookiesParam)) {
            throw new \Exception('请填写Cookie数据');
        }

        $cookies = $cookiesParam;
        if (is_string($cookiesParam)) {
            $cookies = json_decode($cookiesParam, true);
            // Only consult json_last_error() right after decoding; otherwise a
            // stale error from an unrelated earlier decode would be reported.
            if (json_last_error() !== JSON_ERROR_NONE) {
                throw new \Exception('Cookie数据格式错误,请检查JSON格式');
            }
        }

        if (!is_array($cookies) || $cookies === []) {
            throw new \Exception('Cookie数据格式错误,请检查JSON格式');
        }

        return $cookies;
    }

    /**
     * Computes the greatest common divisor with the Euclidean algorithm.
     *
     * @param int $a
     * @param int $b
     * @return int
     */
    private function gcd(int $a, int $b): int
    {
        while ($b !== 0) {
            [$a, $b] = [$b, $a % $b];
        }

        return $a;
    }

    /**
     * Computes the competition ratio as "1:N", N being applicants per opening
     * rounded half-up to an integer (no decimals).
     *
     * Returns '0:0' when there are no openings and '1:0' when there are no
     * applicants. A ratio below 0.5 also renders as '1:0'.
     *
     * @param int $zprs Number of openings
     * @param int $bkrs Number of approved applicants
     * @return string
     */
    private function calculateCompetitionRatio(int $zprs, int $bkrs): string
    {
        if ($zprs <= 0) {
            return '0:0';
        }

        if ($bkrs <= 0) {
            return '1:0';
        }

        // Both operands are positive here, so the rounded value is never negative.
        return '1:' . (int) round($bkrs / $zprs, 0, PHP_ROUND_HALF_UP);
    }

    /**
     * Normalises a raw position record and adds the competition ratio.
     *
     * @param array  $item Raw position data; when it is a list, its first element is used
     * @param string $zwdm Fallback position code when the record has none
     * @return array
     */
    public function formatPositionInfo(array $item, string $zwdm = ''): array
    {
        // The API may return either a single record or a list of records.
        if (isset($item[0]) && is_array($item[0])) {
            $item = $item[0];
        }

        $zprs = isset($item['zprs']) ? (int) $item['zprs'] : 0;
        $bkrs = isset($item['bkrs']) ? (int) $item['bkrs'] : 0;

        return [
            'sbmc' => $item['sbmc'] ?? '',       // province
            'dsmc' => $item['dsmc'] ?? '',       // region
            'zpdwmc' => $item['zpdwmc'] ?? '',   // recruiting unit
            'zwmc' => $item['zwmc'] ?? '',       // position name
            'zwdm' => $item['zwdm'] ?? $zwdm,    // position code
            'zprs' => $zprs,                     // number of openings
            'bkrs' => $bkrs,                     // number of approved applicants
            'competition_ratio' => $this->calculateCompetitionRatio($zprs, $bkrs), // "1:N", N rounded to an integer
        ];
    }

    /**
     * Builds the value of a Cookie header.
     *
     * A string is returned trimmed and otherwise untouched. For an array, the
     * entries under the '请求 Cookie' key are used when present, otherwise the
     * array itself. Empty names/values are dropped, a key may hold several
     * values (e.g. two JSESSIONID cookies), JSESSIONID entries come first and
     * all other cookies keep their input order.
     *
     * @param array|string $cookies Cookie map or raw cookie string
     * @return string "name=value; name=value" string
     */
    public function buildCookieString(array|string $cookies): string
    {
        if (is_string($cookies)) {
            return trim($cookies);
        }

        $cookieData = $cookies['请求 Cookie'] ?? $cookies;
        if (is_string($cookieData)) {
            // The nested entry may itself be a raw cookie string.
            return trim($cookieData);
        }
        if (!is_array($cookieData)) {
            return '';
        }

        $normalized = [];
        foreach ($cookieData as $key => $value) {
            $name = trim((string) $key);
            if ($name === '') {
                continue;
            }

            foreach (is_array($value) ? $value : [$value] as $single) {
                if (is_array($single)) {
                    // Deeper nesting cannot be represented as a cookie value.
                    continue;
                }
                $single = trim((string) $single);
                if ($single !== '') {
                    $normalized[$name][] = $single;
                }
            }
        }

        // Move JSESSIONID to the front; everything else keeps insertion order.
        if (isset($normalized['JSESSIONID'])) {
            $normalized = ['JSESSIONID' => $normalized['JSESSIONID']] + $normalized;
        }

        $parts = [];
        foreach ($normalized as $name => $values) {
            foreach ($values as $single) {
                $parts[] = $name . '=' . $single;
            }
        }

        return implode('; ', $parts);
    }

    /**
     * Extracts region options from HTML.
     *
     * When a <select> whose opening tag mentions "地市" exists, only its
     * options are considered; otherwise every <option> carrying a value
     * attribute in the document is. Placeholder entries and garbled labels are
     * filtered out.
     *
     * @param string $html HTML content
     * @return array List of [ 'value' => string, 'text' => string ]
     */
    public function extractDsdmOptions(string $html): array
    {
        $placeholderValues = ['0', '-1', ''];
        $placeholderTexts = ['请选择', '请选择地区', '请选择地市', '全部', ''];

        // Narrow to the region <select> first to reduce false matches.
        if (preg_match('/<select\b[^>]*地市[^>]*>.*?<\/select>/is', $html, $selectBlock)) {
            $html = $selectBlock[0];
        }

        // Group 1: value attribute (double-quoted, single-quoted or bare);
        // group 2: option text. The branch-reset group keeps the value in group 1.
        $optionPattern = '/<option\b[^>]*?\svalue\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s>"\']*))[^>]*>([^<]*)<\/option>/i';
        if (!preg_match_all($optionPattern, $html, $matches)) {
            return [];
        }

        $options = [];
        foreach ($matches[1] as $index => $value) {
            $value = trim($value);
            $text = trim($matches[2][$index] ?? $value);

            if (in_array($value, $placeholderValues, true)) {
                continue;
            }
            if ($text === '' || in_array($text, $placeholderTexts, true)) {
                continue;
            }

            // Garbled text: control characters or invalid UTF-8.
            if (preg_match('/[\x00-\x08\x0B\x0C\x0E-\x1F]/', $text) || !mb_check_encoding($text, 'UTF-8')) {
                continue;
            }

            // A single character is only kept when it is a CJK ideograph, letter or digit.
            if (mb_strlen($text) < 2 && !preg_match('/^[\x{4e00}-\x{9fa5}A-Za-z0-9]+$/u', $text)) {
                continue;
            }

            $options[] = [
                'value' => $value,
                'text' => $text,
            ];
        }

        return $options;
    }
}