Files
shengkao_pachong/app/service/CrawlerService.php
2026-01-20 16:31:05 +08:00

260 lines
8.9 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
declare (strict_types = 1);
namespace app\service;
/**
 * Crawler service.
 *
 * Fetches position data from the remote recruitment site's JSON endpoints
 * and turns the responses into flat rows. Also contains a small helper for
 * pulling the region <option> list out of an HTML page.
 */
class CrawlerService
{
    /**
     * Pause inserted between two consecutive requests of a batch, in
     * microseconds (0.2 s), so the remote site is not hit too quickly.
     */
    private const REQUEST_DELAY_MICROSECONDS = 200000;

    /**
     * Base URL of the remote site; endpoint paths are appended to it and the
     * Referer header is derived from it.
     */
    private string $baseUrl = 'http://gzrsks.oumakspt.com:62/tyzpwb';

    /**
     * Fetch the position tree for a region.
     *
     * @param string $dsdm    Region code, sent as the `dsdm` form field.
     * @param array  $cookies Cookie data, see buildCookieString() for accepted shapes.
     * @return array Decoded JSON response; an empty array when the body decodes
     *               to something that is not an array.
     * @throws \RuntimeException On transport failure, non-200 status or invalid JSON.
     */
    public function getPositionTree(string $dsdm, array $cookies): array
    {
        return $this->postForm('/tree/getPositionTree.htm', ['dsdm' => $dsdm], $cookies);
    }

    /**
     * Fetch the detail record of a single position.
     *
     * @param string $zwdm    Position code, sent as the `zwdm` form field.
     * @param string $examid  Exam id, sent as the `examid` form field.
     * @param array  $cookies Cookie data, see buildCookieString() for accepted shapes.
     * @return array Decoded JSON response; an empty array when the body decodes
     *               to something that is not an array.
     * @throws \RuntimeException On transport failure, non-200 status or invalid JSON.
     */
    public function getPositionInfo(string $zwdm, string $examid, array $cookies): array
    {
        return $this->postForm(
            '/stuchooseexam/getPositionInfo.htm',
            ['zwdm' => $zwdm, 'examid' => $examid],
            $cookies
        );
    }

    /**
     * Fetch the detail records of several positions, one request per code.
     *
     * A failing request does not abort the batch: it produces a row of the form
     * ['zwdm' => ..., 'error' => ...] and processing continues. Positions whose
     * response is empty produce no row at all.
     *
     * @param array  $zwdmList Position codes; scalar values are cast to string.
     * @param string $examid   Exam id.
     * @param array  $cookies  Cookie data, see buildCookieString() for accepted shapes.
     * @return array List of result rows (see buildPositionRow()) and error rows.
     */
    public function batchGetPositionInfo(array $zwdmList, string $examid, array $cookies): array
    {
        $results = [];
        $requestMade = false;

        foreach ($zwdmList as $code) {
            // Codes decoded from JSON may be ints; anything that cannot be a
            // code at all is reported instead of crashing the whole batch.
            if (!is_scalar($code)) {
                $results[] = [
                    'zwdm' => $code,
                    'error' => '职位代码无效',
                ];
                continue;
            }
            $zwdm = (string) $code;

            // Throttle between requests regardless of whether the previous
            // one succeeded, so a failing endpoint is not hammered.
            if ($requestMade) {
                usleep(self::REQUEST_DELAY_MICROSECONDS);
            }
            $requestMade = true;

            try {
                $info = $this->getPositionInfo($zwdm, $examid, $cookies);
            } catch (\Exception $e) {
                // Record the failure but keep processing the remaining codes.
                $results[] = [
                    'zwdm' => $zwdm,
                    'error' => $e->getMessage(),
                ];
                continue;
            }

            if ($info === []) {
                continue;
            }

            // The endpoint may answer with a single record or a list of
            // records; in the latter case only the first one is used.
            $item = (isset($info[0]) && is_array($info[0])) ? $info[0] : $info;
            $results[] = $this->buildPositionRow($item, $zwdm);
        }

        return $results;
    }

    /**
     * Convert one raw position record into the flat result row.
     *
     * @param array  $item Raw record as returned by the remote endpoint.
     * @param string $zwdm Position code used when the record carries none.
     * @return array Row with keys sbmc, dsmc, zpdwmc, zwmc, zwdm, zprs, bkrs
     *               and competition_ratio (string with two decimals).
     */
    private function buildPositionRow(array $item, string $zwdm): array
    {
        $zprs = (int) ($item['zprs'] ?? 0);
        $bkrs = (int) ($item['bkrs'] ?? 0);
        // Applicants per opening; 0 when no openings are reported.
        $ratio = $zprs > 0 ? $bkrs / $zprs : 0.0;

        return [
            'sbmc' => $item['sbmc'] ?? '',     // province
            'dsmc' => $item['dsmc'] ?? '',     // region
            'zpdwmc' => $item['zpdwmc'] ?? '', // recruiting unit
            'zwmc' => $item['zwmc'] ?? '',     // position name
            'zwdm' => $item['zwdm'] ?? $zwdm,  // position code
            'zprs' => $zprs,                   // number of openings
            'bkrs' => $bkrs,                   // number of approved applicants
            // No thousands separator, so the value stays parseable as a number.
            'competition_ratio' => number_format($ratio, 2, '.', ''),
        ];
    }

    /**
     * POST a url-encoded form to an endpoint below the base URL and decode the
     * JSON answer.
     *
     * @param string $path    Endpoint path starting with "/".
     * @param array  $fields  Form fields, encoded with http_build_query().
     * @param array  $cookies Cookie data, see buildCookieString() for accepted shapes.
     * @return array Decoded JSON; an empty array when the body decodes to a
     *               non-array value (null, scalar).
     * @throws \RuntimeException On transport failure, non-200 status or invalid JSON.
     */
    private function postForm(string $path, array $fields, array $cookies): array
    {
        $ch = curl_init();
        if ($ch === false) {
            throw new \RuntimeException('请求失败: cURL初始化失败');
        }

        curl_setopt_array($ch, [
            CURLOPT_URL => $this->baseUrl . $path,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            // NOTE(review): certificate verification is disabled. The base URL
            // is plain http, but redirects are followed, so an https hop would
            // be accepted unverified - confirm this is intended.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => 0,
            // Let cURL send Accept-Encoding AND transparently decompress the
            // body; a hand-written Accept-Encoding header would leave the
            // response compressed and break json_decode().
            CURLOPT_ENCODING => 'gzip, deflate',
            CURLOPT_HTTPHEADER => [
                'Accept: application/json, text/javascript, */*; q=0.01',
                'Accept-Language: zh-CN,zh;q=0.9,en;q=0.8',
                'Cache-Control: no-cache',
                'Connection: keep-alive',
                'Content-Type: application/x-www-form-urlencoded',
                'Cookie: ' . $this->buildCookieString($cookies),
                'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
                'Referer: ' . $this->baseUrl . '/',
                'X-Requested-With: XMLHttpRequest',
            ],
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => http_build_query($fields),
            CURLOPT_TIMEOUT => 30,
            CURLOPT_CONNECTTIMEOUT => 10,
        ]);

        $response = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error = curl_error($ch);
        // No curl_close(): it has no effect on PHP 8.0+, the handle is
        // released when $ch goes out of scope.

        if ($response === false || $error !== '') {
            throw new \RuntimeException('请求失败: ' . $error);
        }
        if ($httpCode !== 200) {
            throw new \RuntimeException('请求失败HTTP状态码: ' . $httpCode);
        }

        $data = json_decode($response, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new \RuntimeException('JSON解析失败: ' . json_last_error_msg());
        }

        // Valid JSON can still be a scalar or null; the declared return type
        // is array, so normalise those to an empty array.
        return is_array($data) ? $data : [];
    }

    /**
     * Build the value of the Cookie request header.
     *
     * Accepted inputs:
     *  - a raw cookie string, returned trimmed;
     *  - an array of name => value pairs (a value may itself be an array, in
     *    which case its last element is used);
     *  - an array holding either of the above under the key '请求 Cookie'.
     *
     * Entries with an empty name or empty value are dropped. Line breaks are
     * removed so the result cannot spill into additional header lines.
     *
     * @param array|string $cookies Cookie data in one of the shapes above.
     * @return string Cookie pairs joined with "; ".
     */
    private function buildCookieString(array|string $cookies): string
    {
        if (is_array($cookies)) {
            $cookies = $cookies['请求 Cookie'] ?? $cookies;
        }

        // Raw header text, either passed directly or nested under the key.
        if (!is_array($cookies)) {
            if (!is_scalar($cookies) && !($cookies instanceof \Stringable)) {
                return '';
            }

            return trim(str_replace(["\r", "\n"], '', (string) $cookies));
        }

        $normalized = [];
        foreach ($cookies as $key => $value) {
            $name = trim(str_replace(["\r", "\n"], '', (string) $key));
            if ($name === '') {
                continue;
            }

            // Several values for one name: the last one wins.
            if (is_array($value)) {
                $value = end($value);
            }
            if (!is_scalar($value) && !($value instanceof \Stringable)) {
                continue;
            }

            $text = trim(str_replace(["\r", "\n"], '', (string) $value));
            if ($text === '') {
                continue;
            }

            $normalized[$name] = $text;
        }

        $parts = [];
        if (isset($normalized['JSESSIONID'])) {
            // By requirement JSESSIONID goes first and is emitted twice with
            // the same value.
            $parts[] = 'JSESSIONID=' . $normalized['JSESSIONID'];
            $parts[] = 'JSESSIONID=' . $normalized['JSESSIONID'];
            unset($normalized['JSESSIONID']);
        }

        // Remaining cookies keep the order in which they were supplied.
        foreach ($normalized as $name => $text) {
            $parts[] = $name . '=' . $text;
        }

        return implode('; ', $parts);
    }

    /**
     * Extract the region options from an HTML fragment.
     *
     * Matches <option> tags that carry a quoted `value` attribute (single or
     * double quotes, in any attribute position) and a non-empty text node.
     * Options whose value is empty after trimming - such as a "please select"
     * placeholder - are skipped; the value "0" is kept.
     *
     * @param string $html HTML content.
     * @return array List of ['value' => string, 'text' => string]; the text
     *               falls back to the value when it is blank.
     */
    public function extractDsdmOptions(string $html): array
    {
        // (?|...) is a branch-reset group: the value lands in group 1 for
        // both quoting styles. No /u modifier, the page may not be UTF-8.
        $pattern = '/<option\b[^>]*?\svalue\s*=\s*(?|"([^"]*)"|\'([^\']*)\')[^>]*>([^<]+)<\/option>/i';

        if (!preg_match_all($pattern, $html, $matches, PREG_SET_ORDER)) {
            return [];
        }

        $options = [];
        foreach ($matches as $match) {
            $value = trim($match[1]);
            // Strict comparison on purpose: empty() would also drop "0".
            if ($value === '') {
                continue;
            }

            $text = trim($match[2]);
            $options[] = [
                'value' => $value,
                'text' => $text !== '' ? $text : $value,
            ];
        }

        return $options;
    }
}