430 lines
14 KiB
PHP
430 lines
14 KiB
PHP
<?php
|
||
declare (strict_types = 1);
|
||
|
||
namespace app\service;
|
||
|
||
/**
 * Crawler service.
 *
 * Builds browser-like HTTP requests against the configured recruitment site,
 * fetches position tree / position detail JSON, and normalises the results
 * (including the competition ratio).
 */
class CrawlerService
{
    /**
     * Application path appended to the configured host to form the base URL.
     */
    private const APP_PATH = '/tyzpwb';

    /**
     * User-Agent sent when impersonating Chrome.
     */
    private const CHROME_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36';

    /**
     * User-Agent sent when impersonating Firefox.
     */
    private const FIREFOX_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:147.0) Gecko/20100101 Firefox/147.0';

    /**
     * Total request timeout, in seconds.
     */
    private const REQUEST_TIMEOUT = 30;

    /**
     * Connection timeout, in seconds.
     */
    private const CONNECT_TIMEOUT = 10;

    /**
     * Pause between two consecutive requests of a batch, in microseconds (0.2 s).
     */
    private const BATCH_DELAY_MICROSECONDS = 200000;

    /**
     * Returns the base URL (scheme, host and port) obtained from ConfigService.
     *
     * A trailing slash is stripped so that appending APP_PATH never yields a
     * double slash and the value can be used directly as an Origin header.
     *
     * @return string
     */
    private function getBaseUrlHost(): string
    {
        $configService = new \app\service\ConfigService();

        return rtrim($configService->getBaseUrl(), '/');
    }

    /**
     * Returns the full base URL, i.e. the configured host followed by APP_PATH.
     *
     * @return string
     */
    public function getBaseUrl(): string
    {
        return $this->getBaseUrlHost() . self::APP_PATH;
    }

    /**
     * Builds the request headers used for plain HTML page requests.
     *
     * @param string $cookieString Value of the Cookie header
     * @param string $referer      Value of the Referer header
     * @return list<string> Header lines in "Name: value" form
     */
    public function buildHtmlHeaders(string $cookieString, string $referer): array
    {
        return [
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language: zh-CN,zh;q=0.9',
            'Accept-Encoding: gzip, deflate',
            'Cache-Control: no-cache',
            'Connection: keep-alive',
            'Cookie: ' . $cookieString,
            'User-Agent: ' . self::CHROME_USER_AGENT,
            'Referer: ' . $referer,
            'Upgrade-Insecure-Requests: 1',
        ];
    }

    /**
     * Builds the request headers used for AJAX (XMLHttpRequest) requests.
     *
     * @param string      $cookieString Value of the Cookie header
     * @param string      $referer      Value of the Referer header
     * @param string      $browserType  'firefox' selects the Firefox profile; any other value selects Chrome
     * @param string|null $origin       Origin header for the Chrome profile (omitted when null);
     *                                  the Firefox profile always uses the configured base host instead
     * @param bool        $withCharset  Whether to append "; charset=UTF-8" to the Content-Type
     * @return list<string> Header lines in "Name: value" form
     */
    public function buildAjaxHeaders(string $cookieString, string $referer, string $browserType = 'chrome', ?string $origin = null, bool $withCharset = false): array
    {
        $headers = [
            'Accept: text/plain, */*',
            'Accept-Language: zh-CN,zh;q=0.9,zh-TW;q=0.8,zh-HK;q=0.7,en-US;q=0.6,en;q=0.5',
            'Accept-Encoding: gzip, deflate',
            'Cache-Control: no-cache',
            'Connection: keep-alive',
            'Content-Type: application/x-www-form-urlencoded' . ($withCharset ? '; charset=UTF-8' : ''),
            'Cookie: ' . $cookieString,
            'Referer: ' . $referer,
            'X-Requested-With: XMLHttpRequest',
        ];

        if ($browserType === 'firefox') {
            // Firefox profile: Pragma, Firefox User-Agent, Origin taken from configuration.
            $headers[] = 'Pragma: no-cache';
            $headers[] = 'User-Agent: ' . self::FIREFOX_USER_AGENT;
            $headers[] = 'Origin: ' . $this->getBaseUrlHost();

            return $headers;
        }

        $headers[] = 'User-Agent: ' . self::CHROME_USER_AGENT;
        if ($origin !== null) {
            $headers[] = 'Origin: ' . $origin;
        }

        return $headers;
    }

    /**
     * Sends a form-encoded POST request and decodes the JSON response.
     *
     * @param string               $url     Target URL
     * @param array<string, mixed> $fields  Form fields, encoded with http_build_query()
     * @param list<string>         $headers Header lines in "Name: value" form
     * @return array Decoded JSON; an empty array when the payload decodes to an empty/falsy value
     * @throws \RuntimeException On transport errors, non-200 status codes or undecodable / non-array JSON
     */
    private function postForm(string $url, array $fields, array $headers): array
    {
        $ch = curl_init();
        if ($ch === false) {
            throw new \RuntimeException('请求失败: curl_init');
        }

        curl_setopt_array($ch, [
            CURLOPT_URL            => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            // NOTE(review): TLS peer/host verification is disabled, as in the original
            // implementation. This exposes the session cookie to man-in-the-middle
            // attacks; enable verification if the target serves a valid certificate.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => 0,
            // Empty string lets cURL decode any compressed response it supports.
            CURLOPT_ENCODING       => '',
            CURLOPT_HTTPHEADER     => $headers,
            CURLOPT_POST           => true,
            CURLOPT_POSTFIELDS     => http_build_query($fields),
            CURLOPT_TIMEOUT        => self::REQUEST_TIMEOUT,
            CURLOPT_CONNECTTIMEOUT => self::CONNECT_TIMEOUT,
        ]);

        $response = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error = curl_error($ch);
        // curl_close() has no effect on PHP 8; dropping the handle releases it.
        unset($ch);

        if ($response === false || $error !== '') {
            throw new \RuntimeException('请求失败: ' . $error);
        }

        if ($httpCode !== 200) {
            throw new \RuntimeException('请求失败，HTTP状态码: ' . $httpCode);
        }

        try {
            $data = json_decode((string) $response, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \RuntimeException('JSON解析失败: ' . $e->getMessage(), 0, $e);
        }

        // null / empty payloads are reported as "no data".
        if (!$data) {
            return [];
        }

        // A scalar payload cannot satisfy the array return type; raise a catchable
        // exception instead of letting a TypeError escape.
        if (!is_array($data)) {
            throw new \RuntimeException('JSON解析失败: 响应不是JSON对象或数组');
        }

        return $data;
    }

    /**
     * Fetches the position tree.
     *
     * @param string $dsdm    Region code
     * @param string $examid  Exam ID
     * @param string $bmid    Department ID
     * @param string $userid  User ID
     * @param string $aa      Timestamp (must match the one used by the selectPosition page)
     * @param array  $cookies Cookie data accepted by buildCookieString()
     * @return array Decoded JSON response, or an empty array when the response is empty
     * @throws \RuntimeException When the request fails or the response is not valid JSON
     */
    public function getPositionTree(string $dsdm, string $examid, string $bmid, string $userid, string $aa, array $cookies): array
    {
        $baseUrl = $this->getBaseUrl();

        // Referer carrying the full selectPosition query string; values are
        // URL-encoded so special characters cannot corrupt the header.
        // NOTE(review): examstupid is hard-coded to 1015 — confirm it is constant.
        $refererUrl = $baseUrl . '/stuchooseexam/selectPosition.htm?' . http_build_query([
            'examstupid' => '1015',
            'userid'     => $userid,
            'bmid'       => $bmid,
            'examid'     => $examid,
            'aa'         => $aa,
        ], '', '&', PHP_QUERY_RFC3986);

        return $this->postForm(
            $baseUrl . '/tree/getPositionTree.htm',
            [
                'examid' => $examid,
                'bmid'   => $bmid,
                'userid' => $userid,
                'dsdm'   => $dsdm,
            ],
            $this->buildAjaxHeaders($this->buildCookieString($cookies), $refererUrl, 'firefox')
        );
    }

    /**
     * Fetches the details of a single position.
     *
     * @param string $zwdm    Position code
     * @param string $examid  Exam ID
     * @param array  $cookies Cookie data accepted by buildCookieString()
     * @return array Decoded JSON response, or an empty array when the response is empty
     * @throws \RuntimeException When the request fails or the response is not valid JSON
     */
    public function getPositionInfo(string $zwdm, string $examid, array $cookies): array
    {
        $baseUrl = $this->getBaseUrl();

        return $this->postForm(
            $baseUrl . '/stuchooseexam/getPositionInfo.htm',
            [
                'zwdm'   => $zwdm,
                'examid' => $examid,
            ],
            $this->buildAjaxHeaders(
                $this->buildCookieString($cookies),
                $baseUrl . '/stuchooseexam/selectPosition.htm',
                'chrome',
                $this->getBaseUrlHost(),
                true
            )
        );
    }

    /**
     * Fetches and formats position details for a list of position codes.
     *
     * A failure for one code does not abort the batch: it is recorded as an
     * entry with 'zwdm' and 'error' keys. Codes whose response is empty are
     * omitted from the result.
     *
     * @param array  $zwdmList Position codes
     * @param string $examid   Exam ID
     * @param array  $cookies  Cookie data accepted by buildCookieString()
     * @return list<array> Formatted position entries and/or error entries
     */
    public function batchGetPositionInfo(array $zwdmList, string $examid, array $cookies): array
    {
        $results = [];
        $isFirst = true;

        foreach ($zwdmList as $zwdm) {
            // Codes coming from decoded JSON may be integers; strict_types would
            // otherwise raise an uncaught TypeError in getPositionInfo().
            $zwdm = (string) $zwdm;

            // Throttle between requests, regardless of whether the previous one failed.
            if (!$isFirst) {
                usleep(self::BATCH_DELAY_MICROSECONDS);
            }
            $isFirst = false;

            try {
                $info = $this->getPositionInfo($zwdm, $examid, $cookies);

                if (!empty($info)) {
                    $results[] = $this->formatPositionInfo($info, $zwdm);
                }
            } catch (\Exception $e) {
                // Record the error and keep processing the remaining codes.
                $results[] = [
                    'zwdm'  => $zwdm,
                    'error' => $e->getMessage(),
                ];
            }
        }

        return $results;
    }

    /**
     * Parses and validates cookie data.
     *
     * @param mixed $cookiesParam Either a JSON string or an already decoded array
     * @return array Non-empty cookie array
     * @throws \InvalidArgumentException When the input is empty, is not valid JSON,
     *                                   or does not resolve to a non-empty array
     */
    public function parseCookies(mixed $cookiesParam): array
    {
        if (empty($cookiesParam)) {
            throw new \InvalidArgumentException('请填写Cookie数据');
        }

        $cookies = $cookiesParam;

        // Only strings are decoded, and the decode result is checked directly, so an
        // unrelated earlier json_decode() failure cannot reject valid array input.
        if (is_string($cookiesParam)) {
            try {
                $cookies = json_decode($cookiesParam, true, 512, JSON_THROW_ON_ERROR);
            } catch (\JsonException $e) {
                throw new \InvalidArgumentException('Cookie数据格式错误，请检查JSON格式', 0, $e);
            }
        }

        if (!is_array($cookies) || $cookies === []) {
            throw new \InvalidArgumentException('Cookie数据格式错误，请检查JSON格式');
        }

        return $cookies;
    }

    /**
     * Computes the greatest common divisor with the Euclidean algorithm.
     *
     * Not referenced anywhere else in this class.
     *
     * @param int $a
     * @param int $b
     * @return int
     */
    private function gcd(int $a, int $b): int
    {
        while ($b !== 0) {
            [$a, $b] = [$b, $a % $b];
        }

        return $a;
    }

    /**
     * Computes the competition ratio as "1:N".
     *
     * N is approved applicants divided by openings, rounded to two decimals;
     * the decimals are dropped when they are ".00".
     *
     * @param int $zprs Number of openings
     * @param int $bkrs Number of approved applicants
     * @return string '0:0' when there are no openings, '1:0' when there are no applicants
     */
    private function calculateCompetitionRatio(int $zprs, int $bkrs): string
    {
        if ($zprs <= 0) {
            return '0:0';
        }

        if ($bkrs <= 0) {
            return '1:0';
        }

        $ratioFormatted = number_format($bkrs / $zprs, 2, '.', '');

        // Whole ratios are shown without the ".00" suffix.
        if (str_ends_with($ratioFormatted, '.00')) {
            return '1:' . substr($ratioFormatted, 0, -3);
        }

        return '1:' . $ratioFormatted;
    }

    /**
     * Normalises a raw position record and adds the competition ratio.
     *
     * @param array  $item Raw position record, or a list whose first element is the record
     * @param string $zwdm Fallback position code used when the record has none
     * @return array{sbmc: mixed, dsmc: mixed, zpdwmc: mixed, zwmc: mixed, zwdm: mixed, zprs: int, bkrs: int, competition_ratio: string}
     */
    public function formatPositionInfo(array $item, string $zwdm = ''): array
    {
        // The endpoint may wrap the record in a list; unwrap the first element.
        if (isset($item[0]) && is_array($item[0])) {
            $item = $item[0];
        }

        $zprs = (int) ($item['zprs'] ?? 0);
        $bkrs = (int) ($item['bkrs'] ?? 0);

        return [
            'sbmc'              => $item['sbmc'] ?? '',   // province
            'dsmc'              => $item['dsmc'] ?? '',   // region
            'zpdwmc'            => $item['zpdwmc'] ?? '', // recruiting unit
            'zwmc'              => $item['zwmc'] ?? '',   // position name
            'zwdm'              => $item['zwdm'] ?? $zwdm, // position code
            'zprs'              => $zprs,                 // number of openings
            'bkrs'              => $bkrs,                 // number of approved applicants
            'competition_ratio' => $this->calculateCompetitionRatio($zprs, $bkrs),
        ];
    }

    /**
     * Builds the value of the Cookie header.
     *
     * A string is returned trimmed and otherwise untouched. For an array, the
     * entries under the '请求 Cookie' key are used when present, otherwise the
     * array itself. A key may map to several values (e.g. two JSESSIONID
     * cookies); all of them are emitted. Blank names and blank values are skipped.
     *
     * @param array|string $cookies Cookie array or raw cookie string
     * @return string "name=value" pairs joined with "; ", JSESSIONID first,
     *                the remaining names in their original order
     */
    public function buildCookieString(array|string $cookies): string
    {
        if (is_string($cookies)) {
            return trim($cookies);
        }

        $cookieData = $cookies['请求 Cookie'] ?? $cookies;

        // The nested entry may itself be a raw cookie string.
        if (is_string($cookieData)) {
            return trim($cookieData);
        }
        if (!is_array($cookieData)) {
            return '';
        }

        $normalized = [];
        foreach ($cookieData as $key => $value) {
            $name = trim((string) $key);
            if ($name === '') {
                continue;
            }

            foreach (is_array($value) ? $value : [$value] as $single) {
                // Nested arrays / objects / null cannot be rendered as a cookie value.
                if (!is_scalar($single)) {
                    continue;
                }

                $single = trim((string) $single);
                if ($single !== '') {
                    $normalized[$name][] = $single;
                }
            }
        }

        // Array union keeps the left-hand keys first, which moves JSESSIONID to the front.
        if (isset($normalized['JSESSIONID'])) {
            $normalized = ['JSESSIONID' => $normalized['JSESSIONID']] + $normalized;
        }

        $parts = [];
        foreach ($normalized as $name => $values) {
            foreach ($values as $single) {
                $parts[] = $name . '=' . $single;
            }
        }

        return implode('; ', $parts);
    }

    /**
     * Extracts the <option> entries (region choices) from an HTML fragment.
     *
     * Only options written as <option value="..." ...>text</option> with a
     * non-empty value and non-empty text are matched. The pattern deliberately
     * has no "u" modifier, so it does not fail on non-UTF-8 pages.
     *
     * @param string $html HTML content
     * @return list<array{value: string, text: string}>
     */
    public function extractDsdmOptions(string $html): array
    {
        $found = preg_match_all('/<option\s+value="([^"]+)"[^>]*>([^<]+)<\/option>/i', $html, $matches, PREG_SET_ORDER);
        if (!$found) {
            return [];
        }

        $options = [];
        foreach ($matches as [, $value, $text]) {
            // Keeps the previous empty() semantics: a literal "0" value is skipped
            // (presumably a placeholder entry — confirm against the real page).
            if ($value === '0') {
                continue;
            }

            $options[] = [
                'value' => $value,
                'text'  => trim($text),
            ];
        }

        return $options;
    }
}
|