初始版本发布

This commit is contained in:
杨志
2026-01-20 14:12:19 +08:00
parent 6e0c3ae1a0
commit 871aefc33e
4 changed files with 1051 additions and 0 deletions

285
app/controller/Crawler.php Normal file
View File

@@ -0,0 +1,285 @@
<?php
declare (strict_types = 1);
namespace app\controller;
use app\BaseController;
use app\service\CrawlerService;
use think\facade\View;
/**
 * Crawler controller.
 *
 * Renders the crawler tool page and exposes JSON endpoints that proxy
 * requests to the remote site through CrawlerService. Every JSON endpoint
 * answers with `code` (1 = success, 0 = failure), `msg`, and `data` on success.
 */
class Crawler extends BaseController
{
    /**
     * Base URL of the remote site queried directly by getDsdmOptions().
     */
    private const REMOTE_BASE_URL = 'http://gzrsks.oumakspt.com:62/tyzpwb';

    /**
     * Render the crawler tool landing page.
     */
    public function index()
    {
        return View::fetch();
    }

    /**
     * Fetch the position-selection page and return the region options found in its HTML.
     *
     * Request parameters: examid, bmid, userid (all required).
     */
    public function getDsdmOptions()
    {
        try {
            $examid = $this->stringParam('examid');
            $bmid = $this->stringParam('bmid');
            $userid = $this->stringParam('userid');
            if (empty($examid) || empty($bmid) || empty($userid)) {
                return $this->jsonFailure('请先填写examid、bmid和userid');
            }

            // Encode every value: the parameters come straight from the request
            // and must not be able to inject extra query parameters.
            $query = http_build_query([
                // NOTE(review): examstupid was hard-coded to 1015 in the original URL — confirm it is constant.
                'examstupid' => '1015',
                'userid' => $userid,
                'bmid' => $bmid,
                'examid' => $examid,
                // Second-resolution timestamp padded with "000", appended as in the original URL.
                'aa' => time() . '000',
            ], '', '&');
            $url = self::REMOTE_BASE_URL . '/stuchooseexam/selectPosition.htm?' . $query;

            $ch = curl_init();
            if ($ch === false) {
                return $this->jsonFailure('获取网页失败: cURL初始化失败');
            }
            curl_setopt_array($ch, [
                CURLOPT_URL => $url,
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_TIMEOUT => 30,
                CURLOPT_CONNECTTIMEOUT => 10,
            ]);
            $html = curl_exec($ch);
            $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
            $error = curl_error($ch);
            curl_close($ch);

            if ($error) {
                return $this->jsonFailure('获取网页失败: ' . $error);
            }
            if ($httpCode !== 200) {
                return $this->jsonFailure('获取网页失败HTTP状态码: ' . $httpCode);
            }
            // curl_exec() yields false on failure; passing that to a string-typed
            // parameter would raise a TypeError that the catch below does not cover.
            if (!is_string($html)) {
                return $this->jsonFailure('获取网页失败: 响应内容为空');
            }

            $service = new CrawlerService();

            return $this->jsonSuccess($service->extractDsdmOptions($html));
        } catch (\Exception $e) {
            return $this->jsonFailure('获取失败: ' . $e->getMessage());
        }
    }

    /**
     * Return the position codes available for a region.
     *
     * Request parameters: dsdm (region code) and cookies (JSON object or array).
     */
    public function getZwdmList()
    {
        try {
            $dsdm = $this->stringParam('dsdm');
            $cookiesParam = $this->request->param('cookies', '');
            if (empty($dsdm)) {
                return $this->jsonFailure('请选择地区');
            }
            if (empty($cookiesParam)) {
                return $this->jsonFailure('请填写Cookie数据');
            }
            $cookies = $this->decodeArrayParam($cookiesParam);
            if ($cookies === null) {
                return $this->jsonFailure('Cookie数据格式错误请检查JSON格式');
            }

            $service = new CrawlerService();
            $treeData = $service->getPositionTree($dsdm, $cookies);

            // Every tree node carrying a non-empty CODE is reported as a position code.
            $zwdmList = [];
            foreach ($treeData as $item) {
                if (is_array($item) && !empty($item['CODE'])) {
                    $zwdmList[] = [
                        'zwdm' => $item['CODE'],
                        'title' => $item['TITLE'] ?? $item['CODE'],
                    ];
                }
            }

            return $this->jsonSuccess($zwdmList);
        } catch (\Exception $e) {
            return $this->jsonFailure('获取失败: ' . $e->getMessage());
        }
    }

    /**
     * Return the details of a single position, including its competition ratio.
     *
     * Request parameters: zwdm (position code), examid, cookies (JSON object or array).
     */
    public function getPositionInfo()
    {
        try {
            $zwdm = $this->stringParam('zwdm');
            $examid = $this->stringParam('examid');
            $cookiesParam = $this->request->param('cookies', '');
            if (empty($zwdm) || empty($examid)) {
                return $this->jsonFailure('参数不完整');
            }
            if (empty($cookiesParam)) {
                return $this->jsonFailure('请填写Cookie数据');
            }
            $cookies = $this->decodeArrayParam($cookiesParam);
            if ($cookies === null) {
                return $this->jsonFailure('Cookie数据格式错误请检查JSON格式');
            }

            $service = new CrawlerService();
            $info = $service->getPositionInfo($zwdm, $examid, $cookies);
            if (empty($info)) {
                return $this->jsonFailure('未获取到数据');
            }

            // The remote answers either with one record or with a list of records;
            // in the latter case only the first record is used.
            $item = (isset($info[0]) && is_array($info[0])) ? $info[0] : $info;

            return $this->jsonSuccess($this->formatPositionItem($item, $zwdm));
        } catch (\Exception $e) {
            return $this->jsonFailure('获取失败: ' . $e->getMessage());
        }
    }

    /**
     * Return the details of several positions in one call.
     *
     * Request parameters: zwdm_list (JSON array of position codes), examid,
     * cookies (JSON object or array).
     */
    public function batchGetPositionInfo()
    {
        try {
            $zwdmListParam = $this->request->param('zwdm_list', '');
            $examid = $this->stringParam('examid');
            $cookiesParam = $this->request->param('cookies', '');

            $zwdmList = $this->decodeArrayParam($zwdmListParam);
            if ($zwdmList === null) {
                return $this->jsonFailure('请选择职位代码');
            }
            // The service hands each code to a string-typed parameter; under
            // strict_types an int or nested array would raise an uncaught TypeError.
            $codes = [];
            foreach ($zwdmList as $zwdm) {
                if (!is_string($zwdm) && !is_int($zwdm)) {
                    return $this->jsonFailure('职位代码格式错误');
                }
                $codes[] = (string) $zwdm;
            }
            if (empty($examid)) {
                return $this->jsonFailure('请填写examid');
            }
            if (empty($cookiesParam)) {
                return $this->jsonFailure('请填写Cookie数据');
            }
            $cookies = $this->decodeArrayParam($cookiesParam);
            if ($cookies === null) {
                return $this->jsonFailure('Cookie数据格式错误请检查JSON格式');
            }

            $service = new CrawlerService();

            return $this->jsonSuccess($service->batchGetPositionInfo($codes, $examid, $cookies));
        } catch (\Exception $e) {
            return $this->jsonFailure('获取失败: ' . $e->getMessage());
        }
    }

    /**
     * Read a request parameter as a string.
     *
     * Scalars are cast to string (a JSON body may deliver ints); anything else,
     * such as an array, is treated as missing and yields ''.
     *
     * @param string $name Request parameter name
     * @return string
     */
    private function stringParam(string $name): string
    {
        $value = $this->request->param($name, '');

        return is_scalar($value) ? (string) $value : '';
    }

    /**
     * Turn a request value into a non-empty array.
     *
     * Strings are JSON-decoded; other values are taken as they are. The JSON
     * error state is only consulted when json_decode() actually ran, so a stale
     * error from an earlier decode cannot influence the result.
     *
     * @param mixed $raw Raw request value
     * @return array|null The array, or null when the value is not a non-empty array
     */
    private function decodeArrayParam($raw): ?array
    {
        if (is_string($raw)) {
            $decoded = json_decode($raw, true);
            if (json_last_error() !== JSON_ERROR_NONE) {
                return null;
            }
        } else {
            $decoded = $raw;
        }

        return (is_array($decoded) && !empty($decoded)) ? $decoded : null;
    }

    /**
     * Map a raw position record to the response structure and compute the competition ratio.
     *
     * @param array  $item Raw record returned by the remote site
     * @param string $zwdm Requested position code, used when the record carries none
     * @return array
     */
    private function formatPositionItem(array $item, string $zwdm): array
    {
        $zprs = isset($item['zprs']) ? intval($item['zprs']) : 0;
        $bkrs = isset($item['bkrs']) ? intval($item['bkrs']) : 0;
        // Applicants per opening; 0 when no openings are reported, avoiding a division by zero.
        $competitionRatio = $zprs > 0 ? ($bkrs / $zprs) : 0;

        return [
            'sbmc' => $item['sbmc'] ?? '', // province
            'dsmc' => $item['dsmc'] ?? '', // region
            'zpdwmc' => $item['zpdwmc'] ?? '', // recruiting unit
            'zwmc' => $item['zwmc'] ?? '', // position name
            'zwdm' => $item['zwdm'] ?? $zwdm, // position code
            'zprs' => $zprs, // number of openings
            'bkrs' => $bkrs, // number of approved applicants
            'competition_ratio' => number_format($competitionRatio, 2), // competition ratio
        ];
    }

    /**
     * Build the success response.
     *
     * @param mixed $data Payload placed under `data`
     */
    private function jsonSuccess($data)
    {
        return json([
            'code' => 1,
            'data' => $data,
            'msg' => '获取成功',
        ]);
    }

    /**
     * Build the failure response.
     *
     * @param string $msg Message shown to the client
     */
    private function jsonFailure(string $msg)
    {
        return json([
            'code' => 0,
            'msg' => $msg,
        ]);
    }
}

View File

@@ -0,0 +1,216 @@
<?php
declare (strict_types = 1);
namespace app\service;
/**
 * Crawler service.
 *
 * Sends form-encoded POST requests to the remote site, decodes its JSON
 * answers, and extracts region options from HTML.
 */
class CrawlerService
{
    /**
     * Base URL of the remote site.
     */
    private $baseUrl = 'http://gzrsks.oumakspt.com:62/tyzpwb';

    /**
     * Fetch the position tree of a region.
     *
     * @param string $dsdm Region code
     * @param array $cookies Cookie data sent with the request
     * @return array Decoded JSON answer, or [] when the answer is not a JSON array/object
     * @throws \Exception On transport errors, non-200 status codes, or invalid JSON
     */
    public function getPositionTree(string $dsdm, array $cookies): array
    {
        return $this->postForm('/tree/getPositionTree.htm', ['dsdm' => $dsdm], $cookies);
    }

    /**
     * Fetch the details of one position.
     *
     * @param string $zwdm Position code
     * @param string $examid Exam ID
     * @param array $cookies Cookie data sent with the request
     * @return array Decoded JSON answer, or [] when the answer is not a JSON array/object
     * @throws \Exception On transport errors, non-200 status codes, or invalid JSON
     */
    public function getPositionInfo(string $zwdm, string $examid, array $cookies): array
    {
        return $this->postForm(
            '/stuchooseexam/getPositionInfo.htm',
            [
                'zwdm' => $zwdm,
                'examid' => $examid,
            ],
            $cookies
        );
    }

    /**
     * Fetch the details of several positions, one request per code.
     *
     * A failure for one code is recorded as an entry with `zwdm` and `error`
     * and does not stop the remaining codes. Codes for which the remote
     * returns no data produce no entry.
     *
     * @param array $zwdmList Position codes (strings or ints)
     * @param string $examid Exam ID
     * @param array $cookies Cookie data sent with each request
     * @return array One formatted record or error entry per processed code
     */
    public function batchGetPositionInfo(array $zwdmList, string $examid, array $cookies): array
    {
        $results = [];
        foreach ($zwdmList as $zwdm) {
            // getPositionInfo() takes a string; under strict_types anything else
            // raises a TypeError, which is not an \Exception and would abort the batch.
            if (!is_string($zwdm) && !is_int($zwdm)) {
                $results[] = [
                    'zwdm' => $zwdm,
                    'error' => '职位代码格式错误',
                ];
                continue;
            }
            $code = (string) $zwdm;

            try {
                $info = $this->getPositionInfo($code, $examid, $cookies);
                if (!empty($info)) {
                    // The remote answers either with one record or with a list of
                    // records; in the latter case only the first record is used.
                    $item = (isset($info[0]) && is_array($info[0])) ? $info[0] : $info;
                    $results[] = $this->formatPositionItem($item, $code);
                }
            } catch (\Exception $e) {
                // Record the error and keep processing the remaining codes.
                $results[] = [
                    'zwdm' => $code,
                    'error' => $e->getMessage(),
                ];
            }

            // Throttle after every request, failed ones included, so a failing
            // remote is not hit in a tight loop.
            usleep(200000); // 0.2 seconds
        }

        return $results;
    }

    /**
     * Extract region options from HTML.
     *
     * Only `<option>` tags whose first attribute is a double-quoted, non-empty
     * `value` are matched. Values that PHP considers empty (for example "0")
     * are skipped, as before.
     *
     * @param string $html HTML content
     * @return array List of ['value' => string, 'text' => string]
     */
    public function extractDsdmOptions(string $html): array
    {
        $pattern = '/<option\s+value="([^"]+)"[^>]*>([^<]+)<\/option>/i';
        // preg_match_all() yields 0 when nothing matches and false on a PCRE error.
        if (!preg_match_all($pattern, $html, $matches)) {
            return [];
        }

        $options = [];
        foreach ($matches[1] as $index => $value) {
            if (empty($value)) {
                continue;
            }
            // Option labels are often surrounded by indentation and newlines.
            $text = trim($matches[2][$index] ?? '');
            $options[] = [
                'value' => $value,
                'text' => $text !== '' ? $text : $value,
            ];
        }

        return $options;
    }

    /**
     * POST form fields to a path below the base URL and decode the JSON answer.
     *
     * @param string $path Path appended to the base URL
     * @param array $fields Form fields
     * @param array $cookies Cookie data sent with the request
     * @return array Decoded JSON answer, or [] when the answer is not a JSON array/object
     * @throws \Exception On transport errors, non-200 status codes, or invalid JSON
     */
    private function postForm(string $path, array $fields, array $cookies): array
    {
        $ch = curl_init();
        if ($ch === false) {
            throw new \Exception('请求失败: cURL初始化失败');
        }
        curl_setopt_array($ch, [
            CURLOPT_URL => $this->baseUrl . $path,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_HTTPHEADER => [
                'Cookie: ' . $this->buildCookieString($cookies),
                'Content-Type: application/x-www-form-urlencoded',
            ],
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => http_build_query($fields),
            CURLOPT_TIMEOUT => 30,
            CURLOPT_CONNECTTIMEOUT => 10,
        ]);
        $response = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error = curl_error($ch);
        curl_close($ch);

        if ($error || !is_string($response)) {
            throw new \Exception('请求失败: ' . $error);
        }
        if ($httpCode !== 200) {
            throw new \Exception('请求失败HTTP状态码: ' . $httpCode);
        }

        $data = json_decode($response, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new \Exception('JSON解析失败: ' . json_last_error_msg());
        }

        // A JSON scalar is valid JSON but cannot satisfy the array return type.
        return is_array($data) ? $data : [];
    }

    /**
     * Map a raw position record to the result structure and compute the competition ratio.
     *
     * @param array $item Raw record returned by the remote site
     * @param string $zwdm Requested position code, used when the record carries none
     * @return array
     */
    private function formatPositionItem(array $item, string $zwdm): array
    {
        $zprs = isset($item['zprs']) ? intval($item['zprs']) : 0;
        $bkrs = isset($item['bkrs']) ? intval($item['bkrs']) : 0;
        // Applicants per opening; 0 when no openings are reported, avoiding a division by zero.
        $competitionRatio = $zprs > 0 ? ($bkrs / $zprs) : 0;

        return [
            'sbmc' => $item['sbmc'] ?? '', // province
            'dsmc' => $item['dsmc'] ?? '', // region
            'zpdwmc' => $item['zpdwmc'] ?? '', // recruiting unit
            'zwmc' => $item['zwmc'] ?? '', // position name
            'zwdm' => $item['zwdm'] ?? $zwdm, // position code
            'zprs' => $zprs, // number of openings
            'bkrs' => $bkrs, // number of approved applicants
            'competition_ratio' => number_format($competitionRatio, 2), // competition ratio
        ];
    }

    /**
     * Build the value of the Cookie request header.
     *
     * Accepts either a flat name => value map or the same map nested under the
     * key "请求 Cookie". Nested (array) values are skipped, and CR/LF characters
     * are removed so caller-supplied data cannot add header lines.
     *
     * @param array $cookies Cookie data
     * @return string "name=value" pairs joined by "; "
     */
    private function buildCookieString(array $cookies): string
    {
        $cookieData = $cookies['请求 Cookie'] ?? $cookies;
        if (!is_array($cookieData)) {
            return '';
        }

        $pairs = [];
        foreach ($cookieData as $key => $value) {
            if (is_array($value)) {
                continue;
            }
            $pairs[] = str_replace(["\r", "\n"], '', $key . '=' . $value);
        }

        return implode('; ', $pairs);
    }
}