Files
shengkao_pachong/app/service/CrawlerService.php
杨志 272dbcb424 up
2026-02-02 15:16:36 +08:00

513 lines
18 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
declare (strict_types = 1);
namespace app\service;
/**
 * Crawler service.
 *
 * Builds browser-like HTTP requests for the position (job posting) endpoints
 * of the target site and normalises the data those endpoints return.
 */
class CrawlerService
{
    /**
     * Application path appended to the configured host to form the base URL.
     */
    private const APP_PATH = '/tyzpwb';

    /**
     * User-Agent header value sent when imitating Chrome.
     */
    private const CHROME_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36';

    /**
     * User-Agent header value sent when imitating Firefox.
     */
    private const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:147.0) Gecko/20100101 Firefox/147.0';

    /**
     * Pause between two consecutive batch requests, in microseconds (0.2 s).
     */
    private const REQUEST_DELAY_MICROSECONDS = 200000;

    /**
     * Get the base URL host (scheme, domain and port) from the config service.
     *
     * @return string
     */
    private function getBaseUrlHost(): string
    {
        $configService = new \app\service\ConfigService();

        return $configService->getBaseUrl();
    }

    /**
     * Get the full base URL: configured host followed by the application path.
     *
     * @return string
     */
    public function getBaseUrl(): string
    {
        return $this->getBaseUrlHost() . self::APP_PATH;
    }

    /**
     * Build the request headers used for HTML page requests.
     *
     * @param string $cookieString Value of the Cookie header
     * @param string $referer      Referer URL
     * @return string[] Header lines in "Name: value" form
     */
    public function buildHtmlHeaders(string $cookieString, string $referer): array
    {
        return [
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language: zh-CN,zh;q=0.9',
            'Accept-Encoding: gzip, deflate',
            'Cache-Control: no-cache',
            'Connection: keep-alive',
            'Cookie: ' . $cookieString,
            'User-Agent: ' . self::CHROME_USER_AGENT,
            'Referer: ' . $referer,
            'Upgrade-Insecure-Requests: 1',
        ];
    }

    /**
     * Build the request headers used for AJAX (XMLHttpRequest) form posts.
     *
     * For 'firefox' a Pragma header is added and the Origin header is always
     * the configured host; $origin is ignored in that case. For any other
     * browser type the Chrome User-Agent is used and an Origin header is sent
     * only when $origin is not null.
     *
     * @param string      $cookieString Value of the Cookie header
     * @param string      $referer      Referer URL
     * @param string      $browserType  'chrome' (default) or 'firefox'
     * @param string|null $origin       Optional Origin URL (non-Firefox only)
     * @param bool        $withCharset  Append "; charset=UTF-8" to the Content-Type
     * @return string[] Header lines in "Name: value" form
     */
    public function buildAjaxHeaders(string $cookieString, string $referer, string $browserType = 'chrome', ?string $origin = null, bool $withCharset = false): array
    {
        $headers = [
            'Accept: text/plain, */*',
            'Accept-Language: zh-CN,zh;q=0.9,zh-TW;q=0.8,zh-HK;q=0.7,en-US;q=0.6,en;q=0.5',
            'Accept-Encoding: gzip, deflate',
            'Cache-Control: no-cache',
            'Connection: keep-alive',
            'Content-Type: application/x-www-form-urlencoded' . ($withCharset ? '; charset=UTF-8' : ''),
            'Cookie: ' . $cookieString,
            'Referer: ' . $referer,
            'X-Requested-With: XMLHttpRequest',
        ];

        if ($browserType === 'firefox') {
            $headers[] = 'Pragma: no-cache';
            $headers[] = 'User-Agent: ' . self::FIREFOX_USER_AGENT;
            $headers[] = 'Origin: ' . $this->getBaseUrlHost();

            return $headers;
        }

        $headers[] = 'User-Agent: ' . self::CHROME_USER_AGENT;
        if ($origin !== null) {
            $headers[] = 'Origin: ' . $origin;
        }

        return $headers;
    }

    /**
     * POST a URL-encoded form and decode the JSON response into an array.
     *
     * @param string   $url     Target URL
     * @param array    $fields  Form fields, encoded with http_build_query()
     * @param string[] $headers Header lines in "Name: value" form
     * @return array Decoded payload; an empty array when the payload is empty
     * @throws \Exception On transport failure, a non-200 status, invalid JSON,
     *                    or a JSON payload that is not an object/array
     */
    private function postFormForJson(string $url, array $fields, array $headers): array
    {
        $ch = curl_init();
        if ($ch === false) {
            throw new \Exception('请求失败: ' . 'curl_init failed');
        }

        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            // NOTE(review): TLS certificate and host verification are disabled,
            // which allows man-in-the-middle interception of the session
            // cookies. Kept as-is because the endpoint's certificate setup is
            // not visible from this file; enable verification if possible.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            // An empty string lets cURL advertise and decode every encoding it supports (gzip etc.).
            CURLOPT_ENCODING => '',
            CURLOPT_HTTPHEADER => $headers,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => http_build_query($fields),
            CURLOPT_TIMEOUT => 30,
            CURLOPT_CONNECTTIMEOUT => 10,
        ]);

        $response = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error = curl_error($ch);
        curl_close($ch);

        // Also treat a false response as a failure so that json_decode()
        // below is never handed a non-string.
        if ($response === false || $error !== '') {
            throw new \Exception('请求失败: ' . $error);
        }
        if ($httpCode !== 200) {
            throw new \Exception('请求失败HTTP状态码: ' . $httpCode);
        }

        try {
            $data = json_decode($response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \Exception('JSON解析失败: ' . $e->getMessage(), 0, $e);
        }

        // Empty payloads (null, false, 0, "", []) are reported as "no data".
        if (empty($data)) {
            return [];
        }
        // A truthy scalar would violate the array return type with a
        // TypeError; report it as an ordinary request failure instead.
        if (!is_array($data)) {
            throw new \Exception('JSON解析失败: 响应不是JSON对象或数组');
        }

        return $data;
    }

    /**
     * Fetch the position tree for a region.
     *
     * @param string $dsdm    Region code
     * @param string $examid  Exam ID
     * @param string $bmid    Department ID
     * @param string $userid  User ID
     * @param string $aa      Timestamp; must match the one used on the selectPosition page
     * @param array  $cookies Cookie data, see buildCookieString()
     * @return array Decoded JSON response, or an empty array when it is empty
     * @throws \Exception When the request fails or the response is not usable JSON
     */
    public function getPositionTree(string $dsdm, string $examid, string $bmid, string $userid, string $aa, array $cookies): array
    {
        $baseUrl = $this->getBaseUrl();
        $cookieString = $this->buildCookieString($cookies);
        // The Referer carries the same query parameters as the selectPosition page.
        $refererUrl = $baseUrl . "/stuchooseexam/selectPosition.htm?examstupid=1015&userid={$userid}&bmid={$bmid}&examid={$examid}&aa={$aa}";

        return $this->postFormForJson(
            $baseUrl . '/tree/getPositionTree.htm',
            [
                'examid' => $examid,
                'bmid' => $bmid,
                'userid' => $userid,
                'dsdm' => $dsdm,
            ],
            $this->buildAjaxHeaders($cookieString, $refererUrl, 'firefox')
        );
    }

    /**
     * Return the tree nodes that may be crawled.
     *
     * Accepts either a bare list of nodes or a wrapper holding the list under
     * 'data' or 'tree'. Skips non-array entries, nodes flagged nocheck/noCheck
     * (true, 1 or the string "true" in any letter case) and nodes without a
     * non-empty CODE.
     *
     * @param array $treeData Return value of getPositionTree()
     * @return array[] Remaining nodes in their original order
     */
    private function selectablePositionNodes(array $treeData): array
    {
        $nodes = $treeData;
        if (isset($treeData['data']) && is_array($treeData['data'])) {
            $nodes = $treeData['data'];
        } elseif (isset($treeData['tree']) && is_array($treeData['tree'])) {
            $nodes = $treeData['tree'];
        }

        $selected = [];
        foreach ($nodes as $node) {
            if (!is_array($node)) {
                continue;
            }

            // Strict comparisons so that the string "false" is not taken as true.
            $nocheck = $node['nocheck'] ?? $node['noCheck'] ?? null;
            $isNocheck = $nocheck === true
                || $nocheck === 1
                || (is_string($nocheck) && strtolower($nocheck) === 'true');
            if ($isNocheck) {
                continue;
            }

            if (isset($node['CODE']) && $node['CODE'] !== '') {
                $selected[] = $node;
            }
        }

        return $selected;
    }

    /**
     * Collect the crawlable position codes from position tree data,
     * excluding nodes flagged with nocheck.
     *
     * The endpoint returns a flat list such as [ { CODE, TITLE, nocheck?, ... }, ... ].
     *
     * @param array $treeData Return value of getPositionTree() (bare list or wrapper)
     * @return array CODE values exactly as they appear in the tree data
     */
    public function collectPositionCodesExcludingNocheck(array $treeData): array
    {
        return array_map(
            static fn (array $node) => $node['CODE'],
            $this->selectablePositionNodes($treeData)
        );
    }

    /**
     * Collect the crawlable positions from position tree data, excluding
     * nodes flagged with nocheck, for use in drop-down or checkbox lists.
     *
     * @param array $treeData Return value of getPositionTree()
     * @return array[] Items of the form [ 'zwdm' => CODE, 'title' => TITLE ];
     *                 the title falls back to the code when TITLE is missing
     */
    public function collectPositionListExcludingNocheck(array $treeData): array
    {
        return array_map(
            static fn (array $node): array => [
                'zwdm' => $node['CODE'],
                'title' => $node['TITLE'] ?? $node['CODE'],
            ],
            $this->selectablePositionNodes($treeData)
        );
    }

    /**
     * Fetch the detail record of a single position.
     *
     * @param string $zwdm    Position code
     * @param string $examid  Exam ID
     * @param array  $cookies Cookie data, see buildCookieString()
     * @return array Decoded JSON response, or an empty array when it is empty
     * @throws \Exception When the request fails or the response is not usable JSON
     */
    public function getPositionInfo(string $zwdm, string $examid, array $cookies): array
    {
        $baseUrl = $this->getBaseUrl();
        $cookieString = $this->buildCookieString($cookies);
        $referer = $baseUrl . '/stuchooseexam/selectPosition.htm';
        $origin = $this->getBaseUrlHost();

        return $this->postFormForJson(
            $baseUrl . '/stuchooseexam/getPositionInfo.htm',
            [
                'zwdm' => $zwdm,
                'examid' => $examid,
            ],
            $this->buildAjaxHeaders($cookieString, $referer, 'chrome', $origin, true)
        );
    }

    /**
     * Fetch and format the detail records of several positions.
     *
     * A failure for one code is recorded as [ 'zwdm' => ..., 'error' => ... ]
     * and does not stop the batch. Codes whose response is empty produce no
     * entry. Requests are spaced by a short delay, including after a failure.
     *
     * @param array  $zwdmList Position codes (strings, or numbers as decoded from JSON)
     * @param string $examid   Exam ID
     * @param array  $cookies  Cookie data, see buildCookieString()
     * @return array[] Formatted records and error entries in input order
     */
    public function batchGetPositionInfo(array $zwdmList, string $examid, array $cookies): array
    {
        $results = [];
        $requestSent = false;

        foreach ($zwdmList as $zwdm) {
            if (!is_scalar($zwdm)) {
                $results[] = [
                    'zwdm' => $zwdm,
                    'error' => '职位代码格式错误',
                ];
                continue;
            }

            // Throttle between requests; nothing to wait for before the first one.
            if ($requestSent) {
                usleep(self::REQUEST_DELAY_MICROSECONDS);
            }
            $requestSent = true;

            // Codes decoded from JSON may be numeric; under strict_types an int
            // argument would raise a TypeError that the catch below cannot see.
            $code = (string) $zwdm;

            try {
                $info = $this->getPositionInfo($code, $examid, $cookies);
                if (!empty($info)) {
                    $results[] = $this->formatPositionInfo($info, $code);
                }
            } catch (\Exception $e) {
                // Record the failure and carry on with the remaining codes.
                $results[] = [
                    'zwdm' => $zwdm,
                    'error' => $e->getMessage(),
                ];
            }
        }

        return $results;
    }

    /**
     * Parse and validate cookie data.
     *
     * @param mixed $cookiesParam JSON string or already-decoded array
     * @return array Non-empty cookie array
     * @throws \Exception When the input is empty, is not valid JSON, or does
     *                    not yield a non-empty array
     */
    public function parseCookies($cookiesParam): array
    {
        if (empty($cookiesParam)) {
            throw new \Exception('请填写Cookie数据');
        }

        $cookies = $cookiesParam;
        if (is_string($cookiesParam)) {
            // Decode with exceptions rather than json_last_error(): the global
            // error state may be left over from an unrelated earlier call and
            // must not decide the fate of an input that was never decoded here.
            try {
                $cookies = json_decode($cookiesParam, true, 512, JSON_THROW_ON_ERROR);
            } catch (\JsonException $e) {
                throw new \Exception('Cookie数据格式错误请检查JSON格式', 0, $e);
            }
        }

        // JSON scalars and non-array inputs cannot be returned as an array.
        if (!is_array($cookies) || $cookies === []) {
            throw new \Exception('Cookie数据格式错误请检查JSON格式');
        }

        return $cookies;
    }

    /**
     * Compute the greatest common divisor with the Euclidean algorithm.
     *
     * No method of this class currently calls it.
     *
     * @param int $a
     * @param int $b
     * @return int
     */
    private function gcd(int $a, int $b): int
    {
        while ($b !== 0) {
            $remainder = $a % $b;
            $a = $b;
            $b = $remainder;
        }

        return $a;
    }

    /**
     * Compute the competition ratio as "1:N", with N rounded half up to a
     * whole number (no decimals).
     *
     * Returns "0:0" when there are no openings and "1:0" when nobody passed
     * review. N can also round down to 0 when applicants are fewer than half
     * the openings.
     *
     * @param int $zprs Number of openings
     * @param int $bkrs Number of applicants who passed review
     * @return string
     */
    private function calculateCompetitionRatio(int $zprs, int $bkrs): string
    {
        if ($zprs <= 0) {
            return '0:0';
        }
        if ($bkrs <= 0) {
            return '1:0';
        }

        // Both operands are positive here, so the rounded value is never negative.
        return '1:' . (int) round($bkrs / $zprs);
    }

    /**
     * Format a raw position record and add the competition ratio.
     *
     * @param array  $item Raw record, or a list whose first element is the record
     * @param string $zwdm Fallback position code used when the record has none
     * @return array Keys: sbmc, dsmc, zpdwmc, zwmc, zwdm, zprs, bkrs, competition_ratio
     */
    public function formatPositionInfo(array $item, string $zwdm = ''): array
    {
        // The endpoint may wrap the record in a list; unwrap the first element.
        if (isset($item[0]) && is_array($item[0])) {
            $item = $item[0];
        }

        $zprs = (int) ($item['zprs'] ?? 0);
        $bkrs = (int) ($item['bkrs'] ?? 0);

        return [
            'sbmc' => $item['sbmc'] ?? '', // province
            'dsmc' => $item['dsmc'] ?? '', // region
            'zpdwmc' => $item['zpdwmc'] ?? '', // recruiting unit / department
            'zwmc' => $item['zwmc'] ?? '', // position name
            'zwdm' => $item['zwdm'] ?? $zwdm, // position code
            'zprs' => $zprs, // number of openings
            'bkrs' => $bkrs, // applicants who passed review
            'competition_ratio' => $this->calculateCompetitionRatio($zprs, $bkrs), // "1:N", N is a whole number
        ];
    }

    /**
     * Build the value of the Cookie request header.
     *
     * A string input is used as-is after trimming. An array input may hold the
     * cookies directly or under the key '请求 Cookie'; a value may be a list to
     * send the same cookie name several times (for example two JSESSIONID
     * values). Empty names and empty values are dropped. JSESSIONID pairs come
     * first; all other cookies keep their original order.
     *
     * @param array|string $cookies Cookie array or raw cookie string
     * @return string Pairs joined with "; "
     */
    public function buildCookieString(array|string $cookies): string
    {
        if (is_string($cookies)) {
            return trim($cookies);
        }

        $cookieData = $cookies['请求 Cookie'] ?? $cookies;
        // The wrapped entry may itself be a raw cookie string.
        if (is_string($cookieData)) {
            return trim($cookieData);
        }
        if (!is_array($cookieData)) {
            return '';
        }

        $sessionPairs = [];
        $otherPairs = [];
        foreach ($cookieData as $key => $value) {
            $name = trim((string) $key);
            if ($name === '') {
                continue;
            }

            foreach (is_array($value) ? $value : [$value] as $single) {
                $single = trim((string) $single);
                if ($single === '') {
                    continue;
                }

                if ($name === 'JSESSIONID') {
                    $sessionPairs[] = $name . '=' . $single;
                } else {
                    $otherPairs[] = $name . '=' . $single;
                }
            }
        }

        return implode('; ', array_merge($sessionPairs, $otherPairs));
    }

    /**
     * Extract region options from HTML.
     *
     * When a select element whose opening tag contains "地市" is present, only
     * that element is scanned; otherwise the whole document is. Placeholder
     * entries and garbled text are dropped.
     *
     * @param string $html HTML content
     * @return array[] Items of the form [ 'value' => ..., 'text' => ... ]
     */
    public function extractDsdmOptions(string $html): array
    {
        $placeholderValues = ['0', '-1', ''];
        $placeholderTexts = ['请选择', '请选择地区', '请选择地市', '全部', ''];

        // Narrow the search to the region select when one can be identified.
        if (preg_match('/<select[^>]*地市[^>]*>.*?<\/select>/is', $html, $selectBlock)) {
            $html = $selectBlock[0];
        }

        if (!preg_match_all('/<option\s+value="([^"]*)"[^>]*>([^<]*)<\/option>/i', $html, $matches, PREG_SET_ORDER)) {
            return [];
        }

        $options = [];
        foreach ($matches as [, $rawValue, $rawText]) {
            $value = trim($rawValue);
            $text = trim($rawText);

            if (in_array($value, $placeholderValues, true)) {
                continue;
            }
            if (in_array($text, $placeholderTexts, true)) {
                continue;
            }
            // Garbled text: control characters or invalid UTF-8.
            if (preg_match('/[\x00-\x08\x0B\x0C\x0E-\x1F]/', $text) || !mb_check_encoding($text, 'UTF-8')) {
                continue;
            }
            // A single character is kept only when it is a CJK ideograph, a letter or a digit.
            if (mb_strlen($text) < 2 && !preg_match('/^[\x{4e00}-\x{9fa5}A-Za-z0-9]+$/u', $text)) {
                continue;
            }

            $options[] = [
                'value' => $value,
                'text' => $text,
            ];
        }

        return $options;
    }
}