Files
shengkao_pachong/app/service/CrawlerService.php
杨志 a962e06a18 up
2026-01-21 08:53:45 +08:00

382 lines
13 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
declare (strict_types = 1);
namespace app\service;
/**
 * Crawler service.
 *
 * Builds browser-like HTTP headers and Cookie strings, and issues the POST
 * requests used to fetch the position tree and per-position details from the
 * remote application mounted under APP_PATH.
 */
class CrawlerService
{
    /**
     * Application path appended to the configured host to form the base URL.
     */
    private const APP_PATH = '/tyzpwb';

    /**
     * User-Agent header value sent when imitating Chrome.
     */
    private const CHROME_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36';

    /**
     * User-Agent header value sent when imitating Firefox.
     */
    private const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:147.0) Gecko/20100101 Firefox/147.0';

    /**
     * Pause between two consecutive detail requests in a batch (0.2 seconds).
     */
    private const REQUEST_DELAY_MICROSECONDS = 200000;

    /**
     * Returns the configured base URL without the application path.
     *
     * The value is whatever ConfigService::getBaseUrl() returns; this class
     * uses it as the Origin header and as the prefix of every request URL.
     *
     * @return string
     */
    private function getBaseUrlHost(): string
    {
        $configService = new \app\service\ConfigService();

        return $configService->getBaseUrl();
    }

    /**
     * Returns the full base URL: configured host followed by APP_PATH.
     *
     * @return string
     */
    public function getBaseUrl(): string
    {
        return $this->getBaseUrlHost() . self::APP_PATH;
    }

    /**
     * Builds the request headers for a regular HTML page request.
     *
     * @param string $cookieString Value of the Cookie header
     * @param string $referer      Value of the Referer header
     * @return array List of "Name: value" header lines
     */
    public function buildHtmlHeaders(string $cookieString, string $referer): array
    {
        return [
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language: zh-CN,zh;q=0.9',
            'Accept-Encoding: gzip, deflate',
            'Cache-Control: no-cache',
            'Connection: keep-alive',
            'Cookie: ' . $cookieString,
            'User-Agent: ' . self::CHROME_USER_AGENT,
            'Referer: ' . $referer,
            'Upgrade-Insecure-Requests: 1',
        ];
    }

    /**
     * Builds the request headers for an AJAX (XMLHttpRequest) form POST.
     *
     * @param string      $cookieString Value of the Cookie header
     * @param string      $referer      Value of the Referer header
     * @param string      $browserType  'firefox' selects the Firefox profile; any other value selects Chrome
     * @param string|null $origin       Origin header for the Chrome profile (omitted when null);
     *                                  ignored for Firefox, which always sends the configured host
     * @param bool        $withCharset  Whether to append "; charset=UTF-8" to the Content-Type
     * @return array List of "Name: value" header lines
     */
    public function buildAjaxHeaders(string $cookieString, string $referer, string $browserType = 'chrome', ?string $origin = null, bool $withCharset = false): array
    {
        $headers = [
            'Accept: text/plain, */*',
            'Accept-Language: zh-CN,zh;q=0.9,zh-TW;q=0.8,zh-HK;q=0.7,en-US;q=0.6,en;q=0.5',
            'Accept-Encoding: gzip, deflate',
            'Cache-Control: no-cache',
            'Connection: keep-alive',
            'Content-Type: application/x-www-form-urlencoded' . ($withCharset ? '; charset=UTF-8' : ''),
            'Cookie: ' . $cookieString,
            'Referer: ' . $referer,
            'X-Requested-With: XMLHttpRequest',
        ];

        if ($browserType === 'firefox') {
            // Firefox profile: extra Pragma header, Firefox UA, Origin always from configuration.
            $headers[] = 'Pragma: no-cache';
            $headers[] = 'User-Agent: ' . self::FIREFOX_USER_AGENT;
            $headers[] = 'Origin: ' . $this->getBaseUrlHost();

            return $headers;
        }

        $headers[] = 'User-Agent: ' . self::CHROME_USER_AGENT;
        if ($origin !== null) {
            $headers[] = 'Origin: ' . $origin;
        }

        return $headers;
    }

    /**
     * Fetches the position tree for one region.
     *
     * @param string $dsdm    Region code
     * @param string $examid  Exam ID
     * @param string $bmid    Department ID
     * @param string $userid  User ID
     * @param string $aa      Timestamp; must match the one used on the selectPosition page
     * @param array  $cookies Cookie data (see buildCookieString)
     * @return array Decoded JSON response, or an empty array when the response is not a JSON array/object
     * @throws \Exception On transport failure, non-200 status or invalid JSON
     */
    public function getPositionTree(string $dsdm, string $examid, string $bmid, string $userid, string $aa, array $cookies): array
    {
        $baseUrl = $this->getBaseUrl();
        $cookieString = $this->buildCookieString($cookies);
        // The Referer carries the full parameter set of the selectPosition page.
        $refererUrl = $baseUrl . "/stuchooseexam/selectPosition.htm?examstupid=1015&userid={$userid}&bmid={$bmid}&examid={$examid}&aa={$aa}";

        return $this->postForm(
            $baseUrl . '/tree/getPositionTree.htm',
            [
                'examid' => $examid,
                'bmid' => $bmid,
                'userid' => $userid,
                'dsdm' => $dsdm,
            ],
            $this->buildAjaxHeaders($cookieString, $refererUrl, 'firefox')
        );
    }

    /**
     * Fetches the details of a single position.
     *
     * @param string $zwdm    Position code
     * @param string $examid  Exam ID
     * @param array  $cookies Cookie data (see buildCookieString)
     * @return array Decoded JSON response, or an empty array when the response is not a JSON array/object
     * @throws \Exception On transport failure, non-200 status or invalid JSON
     */
    public function getPositionInfo(string $zwdm, string $examid, array $cookies): array
    {
        // Resolve the configured host once; it serves as both Origin and URL prefix.
        $origin = $this->getBaseUrlHost();
        $baseUrl = $origin . self::APP_PATH;
        $cookieString = $this->buildCookieString($cookies);
        $referer = $baseUrl . '/stuchooseexam/selectPosition.htm';

        return $this->postForm(
            $baseUrl . '/stuchooseexam/getPositionInfo.htm',
            [
                'zwdm' => $zwdm,
                'examid' => $examid,
            ],
            $this->buildAjaxHeaders($cookieString, $referer, 'chrome', $origin, true)
        );
    }

    /**
     * Sends a URL-encoded form POST and decodes the JSON response.
     *
     * @param string $url     Target URL
     * @param array  $fields  Form fields, encoded with http_build_query()
     * @param array  $headers List of "Name: value" header lines
     * @return array Decoded JSON, or an empty array when the payload is not a JSON array/object
     * @throws \Exception On transport failure, non-200 status or invalid JSON
     */
    private function postForm(string $url, array $fields, array $headers): array
    {
        $ch = curl_init();
        if ($ch === false) {
            throw new \Exception('请求失败: curl_init failed');
        }

        curl_setopt_array($ch, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            // NOTE(review): TLS peer/host verification is switched off, so the
            // connection is open to interception; kept as-is to preserve behavior.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            // Empty string lets cURL decode any compressed encoding it supports.
            CURLOPT_ENCODING => '',
            CURLOPT_HTTPHEADER => $headers,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => http_build_query($fields),
            CURLOPT_TIMEOUT => 30,
            CURLOPT_CONNECTTIMEOUT => 10,
        ]);

        $response = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error = curl_error($ch);
        curl_close($ch);

        if ($response === false || $error !== '') {
            throw new \Exception('请求失败: ' . $error);
        }
        if ($httpCode !== 200) {
            throw new \Exception('请求失败HTTP状态码: ' . $httpCode);
        }

        $data = json_decode($response, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new \Exception('JSON解析失败: ' . json_last_error_msg());
        }

        // A scalar JSON payload (number, string, true) must not leak through the array return type.
        return is_array($data) ? $data : [];
    }

    /**
     * Fetches and formats the details of several positions, one request each.
     *
     * A failed request does not abort the batch: it is recorded as an entry
     * with the keys 'zwdm' and 'error'. Positions whose response is empty are
     * left out of the result.
     *
     * @param array  $zwdmList List of position codes
     * @param string $examid   Exam ID
     * @param array  $cookies  Cookie data (see buildCookieString)
     * @return array List of formatted positions and/or error entries
     */
    public function batchGetPositionInfo(array $zwdmList, string $examid, array $cookies): array
    {
        $results = [];
        $isFirst = true;

        foreach ($zwdmList as $zwdm) {
            // Throttle between requests, including after a failed one.
            if (!$isFirst) {
                usleep(self::REQUEST_DELAY_MICROSECONDS);
            }
            $isFirst = false;

            // Numeric codes (e.g. from decoded JSON) would raise a TypeError
            // under strict_types, which the catch below does not handle.
            $code = (is_int($zwdm) || is_float($zwdm)) ? (string) $zwdm : $zwdm;

            try {
                $info = $this->getPositionInfo($code, $examid, $cookies);
                if (!empty($info)) {
                    $results[] = $this->formatPositionInfo($info, $code);
                }
            } catch (\Exception $e) {
                // Record the failure and keep processing the remaining codes.
                $results[] = [
                    'zwdm' => $code,
                    'error' => $e->getMessage(),
                ];
            }
        }

        return $results;
    }

    /**
     * Parses and validates user-supplied cookie data.
     *
     * @param mixed $cookiesParam JSON string or already-decoded array
     * @return array Non-empty cookie array
     * @throws \Exception When the input is empty, is not valid JSON, or does not yield a non-empty array
     */
    public function parseCookies($cookiesParam): array
    {
        if (empty($cookiesParam)) {
            throw new \Exception('请填写Cookie数据');
        }

        $cookies = $cookiesParam;
        if (is_string($cookiesParam)) {
            $cookies = json_decode($cookiesParam, true);
            // Only consult the JSON error state when a decode actually ran;
            // otherwise a stale error from an earlier call would be reported.
            if (json_last_error() !== JSON_ERROR_NONE) {
                throw new \Exception('Cookie数据格式错误请检查JSON格式');
            }
        }

        if (!is_array($cookies) || $cookies === []) {
            throw new \Exception('Cookie数据格式错误请检查JSON格式');
        }

        return $cookies;
    }

    /**
     * Normalizes a raw position record and adds the competition ratio.
     *
     * The ratio has the form "<zprs>:<bkrs>" (recruit count : approved
     * applicants). It is "0:0" whenever the recruit count is not positive,
     * and a non-positive applicant count is shown as 0.
     *
     * @param array  $item Raw position data, either a single record or a list whose first element is the record
     * @param string $zwdm Fallback position code used when the record has none
     * @return array
     */
    public function formatPositionInfo(array $item, string $zwdm = ''): array
    {
        // Unwrap a list-shaped response; leave the input alone if element 0 is not a record.
        if (isset($item[0]) && is_array($item[0])) {
            $item = $item[0];
        }

        $zprs = (int) ($item['zprs'] ?? 0);
        $bkrs = (int) ($item['bkrs'] ?? 0);
        $competitionRatio = $zprs > 0 ? $zprs . ':' . max($bkrs, 0) : '0:0';

        return [
            'sbmc' => $item['sbmc'] ?? '', // province
            'dsmc' => $item['dsmc'] ?? '', // region
            'zpdwmc' => $item['zpdwmc'] ?? '', // recruiting unit / department
            'zwmc' => $item['zwmc'] ?? '', // position name
            'zwdm' => $item['zwdm'] ?? $zwdm, // position code
            'zprs' => $zprs, // recruit count
            'bkrs' => $bkrs, // approved applicants
            'competition_ratio' => $competitionRatio, // "<recruit count>:<approved applicants>"
        ];
    }

    /**
     * Builds the value of the Cookie header.
     *
     * Accepts a raw cookie string (returned trimmed) or an array mapping
     * cookie names to a value or a list of values. When the array has a
     * '请求 Cookie' entry, only that entry is used. Empty names and empty
     * values are dropped. JSESSIONID pairs come first; the remaining cookies
     * keep their insertion order, and every value of a multi-valued cookie is
     * emitted.
     *
     * @param array|string $cookies Cookie array or raw cookie string
     * @return string Pairs joined with "; "
     */
    public function buildCookieString(array|string $cookies): string
    {
        if (is_string($cookies)) {
            return trim($cookies);
        }

        $cookieData = $cookies['请求 Cookie'] ?? $cookies;
        if (is_string($cookieData)) {
            // The nested entry may itself be a raw cookie string.
            return trim($cookieData);
        }
        if (!is_array($cookieData)) {
            return '';
        }

        $normalized = [];
        foreach ($cookieData as $key => $value) {
            $name = trim((string) $key);
            if ($name === '') {
                continue;
            }
            foreach (is_array($value) ? $value : [$value] as $candidate) {
                if (is_array($candidate)) {
                    // Deeper nesting has no cookie representation; skip it
                    // instead of emitting the literal "Array".
                    continue;
                }
                $candidate = trim((string) $candidate);
                if ($candidate !== '') {
                    $normalized[$name][] = $candidate;
                }
            }
        }

        // Move JSESSIONID to the front while keeping the order of everything else.
        if (isset($normalized['JSESSIONID'])) {
            $normalized = ['JSESSIONID' => $normalized['JSESSIONID']] + $normalized;
        }

        $parts = [];
        foreach ($normalized as $name => $values) {
            foreach ($values as $single) {
                $parts[] = $name . '=' . $single;
            }
        }

        return implode('; ', $parts);
    }

    /**
     * Extracts region options from <option> tags in an HTML fragment.
     *
     * Only tags of the form <option value="..." ...>text</option> with a
     * non-empty value and non-empty text are matched, so placeholder entries
     * with an empty value are skipped.
     *
     * @param string $html HTML content
     * @return array List of ['value' => string, 'text' => string]
     */
    public function extractDsdmOptions(string $html): array
    {
        $found = preg_match_all('/<option\s+value="([^"]+)"[^>]*>([^<]+)<\/option>/i', $html, $matches, PREG_SET_ORDER);
        if (!$found) {
            return [];
        }

        $options = [];
        foreach ($matches as $match) {
            // The pattern already guarantees a non-empty value; no empty()
            // test here, because empty('0') would discard the value "0".
            $options[] = [
                'value' => $match[1],
                'text' => $match[2],
            ];
        }

        return $options;
    }
}