This commit is contained in:
杨志
2026-01-20 18:30:18 +08:00
parent 4f0c2112e7
commit c36f73caa2
2 changed files with 196 additions and 215 deletions

View File

@@ -39,29 +39,24 @@ class Crawler extends BaseController
]);
}
if (empty($cookiesParam)) {
$service = new CrawlerService();
try {
$cookies = $service->parseCookies($cookiesParam);
} catch (\Exception $e) {
return json([
'code' => 0,
'msg' => '请填写Cookie数据',
'msg' => $e->getMessage(),
]);
}
// 解析JSON格式的cookies
$cookies = is_string($cookiesParam) ? json_decode($cookiesParam, true) : $cookiesParam;
if (json_last_error() !== JSON_ERROR_NONE || empty($cookies)) {
return json([
'code' => 0,
'msg' => 'Cookie数据格式错误请检查JSON格式',
]);
}
$cookieString = $this->buildCookieString($cookies);
$cookieString = $service->buildCookieString($cookies);
// 构建URL获取HTML - 使用GET请求
$url = "http://gzrsks.oumakspt.com:62/tyzpwb/stuchooseexam/selectPosition.htm?examstupid=1015&userid={$userid}&bmid={$bmid}&examid={$examid}&aa={$aa}";
$baseUrl = $service->getBaseUrl();
$url = "{$baseUrl}/stuchooseexam/selectPosition.htm?examstupid=1015&userid={$userid}&bmid={$bmid}&examid={$examid}&aa={$aa}";
// 构建Referer URL与浏览器实际访问一致
$refererUrl = "http://gzrsks.oumakspt.com:62/tyzpwb/stuchooseexam/input.htm";
$refererUrl = "{$baseUrl}/stuchooseexam/input.htm";
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
@@ -72,17 +67,7 @@ class Crawler extends BaseController
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
curl_setopt($ch, CURLOPT_ENCODING, ''); // 自动处理gzip编码
curl_setopt($ch, CURLOPT_HTTPHEADER, [
'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language: zh-CN,zh;q=0.9',
'Accept-Encoding: gzip, deflate',
'Cache-Control: no-cache',
'Connection: keep-alive',
'Cookie: ' . $cookieString,
'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
'Referer: ' . $refererUrl,
'Upgrade-Insecure-Requests: 1',
]);
curl_setopt($ch, CURLOPT_HTTPHEADER, $service->buildHtmlHeaders($cookieString, $refererUrl));
$html = curl_exec($ch);
$httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
@@ -103,7 +88,6 @@ class Crawler extends BaseController
]);
}
$service = new CrawlerService();
$options = $service->extractDsdmOptions($html);
return json([
@@ -120,53 +104,6 @@ class Crawler extends BaseController
}
}
/**
 * Build the value of a `Cookie:` request header.
 *
 * A string argument is treated as a ready-made cookie header value and is
 * returned trimmed. An array argument is read as name => value (or
 * name => list of values); when the array has a '请求 Cookie' entry, that
 * entry is used as the cookie map instead of the array itself.
 *
 * Blank names and blank values are dropped. Every value of a repeated name
 * is kept (e.g. two JSESSIONID values). JSESSIONID pairs are emitted first;
 * the remaining names follow in the order they were supplied.
 *
 * @param array|string $cookies Cookie map, or a raw cookie header string
 * @return string Pairs in `name=value` form joined with '; '
 */
private function buildCookieString(array|string $cookies): string
{
    // A raw cookie string takes priority and is passed through as-is.
    if (is_string($cookies)) {
        return trim($cookies);
    }

    $source = $cookies['请求 Cookie'] ?? $cookies;

    // Collect the non-blank values under their trimmed cookie name.
    $grouped = [];
    foreach ($source as $rawName => $rawValue) {
        $name = trim((string)$rawName);
        if ($name === '') {
            continue;
        }

        $candidates = is_array($rawValue) ? $rawValue : [$rawValue];
        foreach ($candidates as $candidate) {
            $text = trim((string)$candidate);
            if ($text !== '') {
                $grouped[$name][] = $text;
            }
        }
    }

    // Move JSESSIONID to the front; `+` keeps the remaining keys and order.
    $ordered = [];
    if (isset($grouped['JSESSIONID'])) {
        $ordered['JSESSIONID'] = $grouped['JSESSIONID'];
        unset($grouped['JSESSIONID']);
    }
    $ordered += $grouped;

    $pairs = [];
    foreach ($ordered as $name => $values) {
        foreach ($values as $value) {
            $pairs[] = $name . '=' . $value;
        }
    }

    return implode('; ', $pairs);
}
/**
* 获取职位代码列表
@@ -202,23 +139,16 @@ class Crawler extends BaseController
]);
}
if (empty($cookiesParam)) {
return json([
'code' => 0,
'msg' => '请填写Cookie数据',
]);
}
// 解析JSON格式的cookies
$cookies = is_string($cookiesParam) ? json_decode($cookiesParam, true) : $cookiesParam;
if (json_last_error() !== JSON_ERROR_NONE || empty($cookies)) {
return json([
'code' => 0,
'msg' => 'Cookie数据格式错误请检查JSON格式',
]);
}
$service = new CrawlerService();
try {
$cookies = $service->parseCookies($cookiesParam);
} catch (\Exception $e) {
return json([
'code' => 0,
'msg' => $e->getMessage(),
]);
}
$treeData = $service->getPositionTree($dsdm, $examid, $bmid, $userid, (string)$aa, $cookies);
// 提取所有CODE作为zwdm
@@ -268,22 +198,16 @@ class Crawler extends BaseController
]);
}
if (empty($cookiesParam)) {
return json([
'code' => 0,
'msg' => '请填写Cookie数据',
]);
}
$cookies = is_string($cookiesParam) ? json_decode($cookiesParam, true) : $cookiesParam;
if (json_last_error() !== JSON_ERROR_NONE || empty($cookies)) {
return json([
'code' => 0,
'msg' => 'Cookie数据格式错误请检查JSON格式',
]);
}
$service = new CrawlerService();
try {
$cookies = $service->parseCookies($cookiesParam);
} catch (\Exception $e) {
return json([
'code' => 0,
'msg' => $e->getMessage(),
]);
}
$treeData = $service->getPositionTree($dsdm, $examid, $bmid, $userid, (string)$aa, $cookies);
$zwdmList = [];
@@ -339,48 +263,20 @@ class Crawler extends BaseController
]);
}
if (empty($cookiesParam)) {
return json([
'code' => 0,
'msg' => '请填写Cookie数据',
]);
}
// 解析JSON格式的cookies
$cookies = is_string($cookiesParam) ? json_decode($cookiesParam, true) : $cookiesParam;
if (json_last_error() !== JSON_ERROR_NONE || empty($cookies)) {
return json([
'code' => 0,
'msg' => 'Cookie数据格式错误请检查JSON格式',
]);
}
$service = new CrawlerService();
try {
$cookies = $service->parseCookies($cookiesParam);
} catch (\Exception $e) {
return json([
'code' => 0,
'msg' => $e->getMessage(),
]);
}
$info = $service->getPositionInfo($zwdm, $examid, $cookies);
if (!empty($info)) {
// 处理单条数据或数组数据
if (isset($info[0])) {
$item = $info[0];
} else {
$item = $info;
}
// 计算竞争比(格式:招聘人数:审核通过人数)
$zprs = isset($item['zprs']) ? intval($item['zprs']) : 0;
$bkrs = isset($item['bkrs']) ? intval($item['bkrs']) : 0;
$competitionRatio = $zprs > 0 && $bkrs > 0 ? $zprs . ':' . $bkrs : ($zprs > 0 ? $zprs . ':0' : '0:0');
$result = [
'sbmc' => $item['sbmc'] ?? '', // 省份
'dsmc' => $item['dsmc'] ?? '', // 地区
'zpdwmc' => $item['zpdwmc'] ?? '', // 招聘单位/用人司局
'zwmc' => $item['zwmc'] ?? '', // 职位名称
'zwdm' => $item['zwdm'] ?? $zwdm, // 职位代码
'zprs' => $zprs, // 招聘人数
'bkrs' => $bkrs, // 审核通过人数
'competition_ratio' => $competitionRatio, // 竞争比(格式:招聘人数:审核通过人数)
];
$result = $service->formatPositionInfo($info, $zwdm);
return json([
'code' => 1,
@@ -428,23 +324,16 @@ class Crawler extends BaseController
]);
}
if (empty($cookiesParam)) {
return json([
'code' => 0,
'msg' => '请填写Cookie数据',
]);
}
// 解析JSON格式的cookies
$cookies = is_string($cookiesParam) ? json_decode($cookiesParam, true) : $cookiesParam;
if (json_last_error() !== JSON_ERROR_NONE || empty($cookies)) {
return json([
'code' => 0,
'msg' => 'Cookie数据格式错误请检查JSON格式',
]);
}
$service = new CrawlerService();
try {
$cookies = $service->parseCookies($cookiesParam);
} catch (\Exception $e) {
return json([
'code' => 0,
'msg' => $e->getMessage(),
]);
}
$results = $service->batchGetPositionInfo($zwdmList, $examid, $cookies);
return json([

View File

@@ -10,9 +10,91 @@ namespace app\service;
class CrawlerService
{
/**
* 基础URL
* 基础URL(域名和端口)
*/
private $baseUrl = 'http://gzrsks.oumakspt.com:62/tyzpwb';
private const BASE_URL = 'http://gzrsks.oumakspt.com:62';
/**
* 应用路径
*/
private const APP_PATH = '/tyzpwb';
/**
 * Return the full base URL: BASE_URL followed by APP_PATH.
 *
 * @return string
 */
public function getBaseUrl(): string
{
    $host = self::BASE_URL;
    $path = self::APP_PATH;

    return $host . $path;
}
/**
 * Build the request header list used when fetching an HTML page.
 *
 * The list is in the "Name: value" line format accepted by
 * CURLOPT_HTTPHEADER and carries a fixed Chrome-style header set plus the
 * caller's Cookie and Referer.
 *
 * @param string $cookieString Value for the Cookie header
 * @param string $referer      Value for the Referer header
 * @return array List of header lines
 */
public function buildHtmlHeaders(string $cookieString, string $referer): array
{
    $cookieLine = 'Cookie: ' . $cookieString;
    $refererLine = 'Referer: ' . $referer;
    $userAgentLine = 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36';

    $headers = [];
    $headers[] = 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7';
    $headers[] = 'Accept-Language: zh-CN,zh;q=0.9';
    $headers[] = 'Accept-Encoding: gzip, deflate';
    $headers[] = 'Cache-Control: no-cache';
    $headers[] = 'Connection: keep-alive';
    $headers[] = $cookieLine;
    $headers[] = $userAgentLine;
    $headers[] = $refererLine;
    $headers[] = 'Upgrade-Insecure-Requests: 1';

    return $headers;
}
/**
 * Build the request header list used for AJAX (XMLHttpRequest) calls.
 *
 * The common headers come first, followed by the browser-specific tail:
 *  - 'firefox': Pragma, a Firefox User-Agent, and an Origin header set to
 *    BASE_URL ($origin is ignored in this mode).
 *  - any other value: a Chrome User-Agent, plus an Origin header only when
 *    $origin is not null.
 *
 * @param string      $cookieString Value for the Cookie header
 * @param string      $referer      Value for the Referer header
 * @param string      $browserType  'firefox' selects the Firefox profile; anything else uses Chrome
 * @param string|null $origin       Origin header value for the Chrome profile, or null to omit it
 * @param bool        $withCharset  Append '; charset=UTF-8' to the Content-Type header
 * @return array List of header lines in "Name: value" form
 */
public function buildAjaxHeaders(string $cookieString, string $referer, string $browserType = 'chrome', ?string $origin = null, bool $withCharset = false): array
{
    $contentType = 'Content-Type: application/x-www-form-urlencoded';
    if ($withCharset) {
        $contentType .= '; charset=UTF-8';
    }

    $common = [
        'Accept: text/plain, */*',
        'Accept-Language: zh-CN,zh;q=0.9,zh-TW;q=0.8,zh-HK;q=0.7,en-US;q=0.6,en;q=0.5',
        'Accept-Encoding: gzip, deflate',
        'Cache-Control: no-cache',
        'Connection: keep-alive',
        $contentType,
        'Cookie: ' . $cookieString,
        'Referer: ' . $referer,
        'X-Requested-With: XMLHttpRequest',
    ];

    // Firefox profile: Pragma, then User-Agent, then a fixed Origin.
    if ($browserType === 'firefox') {
        return [
            ...$common,
            'Pragma: no-cache',
            'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:147.0) Gecko/20100101 Firefox/147.0',
            'Origin: ' . self::BASE_URL,
        ];
    }

    // Chrome profile: User-Agent, then Origin only if the caller supplied one.
    $common[] = 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36';
    if ($origin !== null) {
        $common[] = 'Origin: ' . $origin;
    }

    return $common;
}
/**
* 获取职位树数据
@@ -26,12 +108,13 @@ class CrawlerService
*/
public function getPositionTree(string $dsdm, string $examid, string $bmid, string $userid, string $aa, array $cookies): array
{
$url = $this->baseUrl . '/tree/getPositionTree.htm';
$baseUrl = $this->getBaseUrl();
$url = $baseUrl . '/tree/getPositionTree.htm';
$cookieString = $this->buildCookieString($cookies);
// 构建Referer URL包含完整参数
$refererUrl = "http://gzrsks.oumakspt.com:62/tyzpwb/stuchooseexam/selectPosition.htm?examstupid=1015&userid={$userid}&bmid={$bmid}&examid={$examid}&aa={$aa}";
$refererUrl = $baseUrl . "/stuchooseexam/selectPosition.htm?examstupid=1015&userid={$userid}&bmid={$bmid}&examid={$examid}&aa={$aa}";
// 构建POST参数
$postData = [
@@ -48,20 +131,7 @@ class CrawlerService
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
curl_setopt($ch, CURLOPT_ENCODING, ''); // 自动处理gzip编码
curl_setopt($ch, CURLOPT_HTTPHEADER, [
'Accept: text/plain, */*',
'Accept-Language: zh-CN,zh;q=0.9,zh-TW;q=0.8,zh-HK;q=0.7,en-US;q=0.6,en;q=0.5',
'Accept-Encoding: gzip, deflate',
'Pragma: no-cache',
'Cache-Control: no-cache',
'Connection: keep-alive',
'Content-Type: application/x-www-form-urlencoded',
'Cookie: ' . $cookieString,
'Origin: http://gzrsks.oumakspt.com:62',
'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:147.0) Gecko/20100101 Firefox/147.0',
'Referer: ' . $refererUrl,
'X-Requested-With: XMLHttpRequest',
]);
curl_setopt($ch, CURLOPT_HTTPHEADER, $this->buildAjaxHeaders($cookieString, $refererUrl, 'firefox'));
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($postData));
curl_setopt($ch, CURLOPT_TIMEOUT, 30);
@@ -98,10 +168,14 @@ class CrawlerService
*/
public function getPositionInfo(string $zwdm, string $examid, array $cookies): array
{
$url = $this->baseUrl . '/stuchooseexam/getPositionInfo.htm';
$baseUrl = $this->getBaseUrl();
$url = $baseUrl . '/stuchooseexam/getPositionInfo.htm';
$cookieString = $this->buildCookieString($cookies);
$referer = $baseUrl . '/stuchooseexam/selectPosition.htm';
$origin = self::BASE_URL;
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
@@ -109,19 +183,7 @@ class CrawlerService
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
curl_setopt($ch, CURLOPT_ENCODING, ''); // 自动处理gzip编码
curl_setopt($ch, CURLOPT_HTTPHEADER, [
'Accept: text/plain, */*',
'Accept-Language: zh-CN,zh;q=0.9,zh-TW;q=0.8,zh-HK;q=0.7,en-US;q=0.6,en;q=0.5',
'Accept-Encoding: gzip, deflate',
'Cache-Control: no-cache',
'Connection: keep-alive',
'Content-Type: application/x-www-form-urlencoded; charset=UTF-8',
'Cookie: ' . $cookieString,
'Origin: http://gzrsks.oumakspt.com:62',
'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
'Referer: http://gzrsks.oumakspt.com:62/tyzpwb/stuchooseexam/selectPosition.htm',
'X-Requested-With: XMLHttpRequest',
]);
curl_setopt($ch, CURLOPT_HTTPHEADER, $this->buildAjaxHeaders($cookieString, $referer, 'chrome', $origin, true));
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query([
'zwdm' => $zwdm,
@@ -168,28 +230,7 @@ class CrawlerService
$info = $this->getPositionInfo($zwdm, $examid, $cookies);
if (!empty($info)) {
// 处理单条数据或数组数据
if (isset($info[0])) {
$item = $info[0];
} else {
$item = $info;
}
// 计算竞争比(格式:招聘人数:审核通过人数)
$zprs = isset($item['zprs']) ? intval($item['zprs']) : 0;
$bkrs = isset($item['bkrs']) ? intval($item['bkrs']) : 0;
$competitionRatio = $zprs > 0 && $bkrs > 0 ? $zprs . ':' . $bkrs : ($zprs > 0 ? $zprs . ':0' : '0:0');
$results[] = [
'sbmc' => $item['sbmc'] ?? '', // 省份
'dsmc' => $item['dsmc'] ?? '', // 地区
'zpdwmc' => $item['zpdwmc'] ?? '', // 招聘单位/用人司局
'zwmc' => $item['zwmc'] ?? '', // 职位名称
'zwdm' => $item['zwdm'] ?? $zwdm, // 职位代码
'zprs' => $zprs, // 招聘人数
'bkrs' => $bkrs, // 审核通过人数
'competition_ratio' => $competitionRatio, // 竞争比(格式:招聘人数:审核通过人数)
];
$results[] = $this->formatPositionInfo($info, $zwdm);
}
// 避免请求过快,添加小延迟
@@ -207,12 +248,63 @@ class CrawlerService
return $results;
}
/**
 * Parse and validate the cookie payload supplied by the caller.
 *
 * A string is decoded as JSON; any other value is used as-is. The result
 * must be a non-empty array, otherwise an exception carrying a user-facing
 * message is thrown.
 *
 * @param mixed $cookiesParam JSON string or already-decoded array of cookies
 * @return array The decoded, non-empty cookie array
 * @throws \Exception When the payload is empty, is not valid JSON, or does
 *                    not decode to a non-empty array
 */
public function parseCookies($cookiesParam): array
{
    if (empty($cookiesParam)) {
        throw new \Exception('请填写Cookie数据');
    }

    if (is_string($cookiesParam)) {
        $cookies = json_decode($cookiesParam, true);
        // json_last_error() is process-wide state, so it is only meaningful
        // right after a decode performed here. Checking it for array input
        // could reject valid data because of an unrelated earlier failure.
        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new \Exception('Cookie数据格式错误请检查JSON格式');
        }
    } else {
        $cookies = $cookiesParam;
    }

    // Valid JSON can still be a scalar ("123", "true"). Returning that from
    // an `: array` method would raise a TypeError, which callers catching
    // \Exception do not handle, so reject it with the format error instead.
    if (!is_array($cookies) || empty($cookies)) {
        throw new \Exception('Cookie数据格式错误请检查JSON格式');
    }

    return $cookies;
}
/**
 * Normalise a raw position record and attach its competition ratio.
 *
 * When the input has an element at index 0, that element is used as the
 * record; otherwise the input itself is the record. Missing text fields
 * default to '' and missing counts default to 0.
 *
 * The ratio is "openings:approved applicants". It is '0:0' whenever the
 * opening count is not positive, and the applicant side is written as 0
 * when the applicant count is not positive.
 *
 * @param array  $item Raw position data: a single record, or a list whose first element is the record
 * @param string $zwdm Position code used when the record has no 'zwdm' of its own
 * @return array Record with keys sbmc, dsmc, zpdwmc, zwmc, zwdm, zprs, bkrs, competition_ratio
 */
public function formatPositionInfo(array $item, string $zwdm = ''): array
{
    // Accept either one record or a list wrapping it.
    $record = isset($item[0]) ? $item[0] : $item;

    $openings = (int)($record['zprs'] ?? 0);
    $approved = (int)($record['bkrs'] ?? 0);

    if ($openings <= 0) {
        $ratio = '0:0';
    } elseif ($approved > 0) {
        $ratio = $openings . ':' . $approved;
    } else {
        $ratio = $openings . ':0';
    }

    return [
        'sbmc' => $record['sbmc'] ?? '',     // province
        'dsmc' => $record['dsmc'] ?? '',     // region
        'zpdwmc' => $record['zpdwmc'] ?? '', // recruiting unit / employing department
        'zwmc' => $record['zwmc'] ?? '',     // position name
        'zwdm' => $record['zwdm'] ?? $zwdm,  // position code
        'zprs' => $openings,                 // number of openings
        'bkrs' => $approved,                 // applicants who passed review
        'competition_ratio' => $ratio,       // openings:approved applicants
    ];
}
/**
* 构建Cookie字符串
* @param array $cookies Cookie数组
* @param array|string $cookies Cookie数组或字符串
* @return string
*/
private function buildCookieString(array|string $cookies): string
public function buildCookieString(array|string $cookies): string
{
// 如果直接传入原始Cookie字符串优先使用
if (is_string($cookies)) {