From e39126d54ca0f0720605490598cd4e76b9aae51e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E5=BF=97?= Date: Mon, 2 Feb 2026 14:53:38 +0800 Subject: [PATCH] up --- app/controller/Crawler.php | 7 +++--- app/service/CrawlerService.php | 44 +++++++++++++++++++++++++--------- 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/app/controller/Crawler.php b/app/controller/Crawler.php index ba34d08..5edfbc5 100644 --- a/app/controller/Crawler.php +++ b/app/controller/Crawler.php @@ -344,12 +344,11 @@ class Crawler extends BaseController if (is_array($treeData)) { foreach ($treeData as $item) { if (isset($item['CODE']) && !empty($item['CODE'])) { - $code = $item['CODE']; - // 跳过152开头的职位代码 - if (strpos($code, '152') === 0) { + // 跳过带 nocheck: true 的项(如父级/分组节点,不可选) + if (!empty($item['nocheck'])) { continue; } - $zwdmList[] = $code; + $zwdmList[] = $item['CODE']; } } } diff --git a/app/service/CrawlerService.php b/app/service/CrawlerService.php index 528ebfd..f0e4424 100644 --- a/app/service/CrawlerService.php +++ b/app/service/CrawlerService.php @@ -400,28 +400,50 @@ class CrawlerService } /** - * 从HTML中提取地区选项 + * 从HTML中提取地区选项(仅从地市/地区相关的 select 中提取,并过滤占位项与乱码) * @param string $html HTML内容 * @return array */ public function extractDsdmOptions(string $html): array { $options = []; - - // 使用正则表达式提取option标签 - preg_match_all('/]*>([^<]+)<\/option>/i', $html, $matches); - + $placeholderValues = ['0', '-1', '']; + $placeholderTexts = ['请选择', '请选择地区', '请选择地市', '全部', '']; + + // 先尝试只匹配「地市」相关 select 内的 option,减少误匹配 + if (preg_match('/]*地市[^>]*>.*?<\/select>/is', $html, $selectBlock)) { + $html = $selectBlock[0]; + } + + preg_match_all('/]*>([^<]*)<\/option>/i', $html, $matches); + if (!empty($matches[1])) { foreach ($matches[1] as $index => $value) { - if (!empty($value)) { // 跳过"请选择"等空值 - $options[] = [ - 'value' => $value, - 'text' => $matches[2][$index] ?? $value, - ]; + $value = trim($value); + $text = trim($matches[2][$index] ?? $value); + + if (in_array($value, $placeholderValues, true)) { + continue; } + if ($text === '' || in_array($text, $placeholderTexts, true)) { + continue; + } + // 过滤乱码:含控制字符或非 UTF-8 的视为无效 + if (preg_match('/[\x00-\x08\x0B\x0C\x0E-\x1F]/', $text) || !mb_check_encoding($text, 'UTF-8')) { + continue; + } + // 过滤明显非地区名称的短乱码(如单字符乱码) + if (mb_strlen($text) < 2 && !preg_match('/^[\x{4e00}-\x{9fa5}A-Za-z0-9]+$/u', $text)) { + continue; + } + + $options[] = [ + 'value' => $value, + 'text' => $text, + ]; } } - + return $options; } }