up
This commit is contained in:
@@ -344,12 +344,11 @@ class Crawler extends BaseController
|
||||
if (is_array($treeData)) {
|
||||
foreach ($treeData as $item) {
|
||||
if (isset($item['CODE']) && !empty($item['CODE'])) {
|
||||
$code = $item['CODE'];
|
||||
// 跳过152开头的职位代码
|
||||
if (strpos($code, '152') === 0) {
|
||||
// 跳过带 nocheck: true 的项(如父级/分组节点,不可选)
|
||||
if (!empty($item['nocheck'])) {
|
||||
continue;
|
||||
}
|
||||
$zwdmList[] = $code;
|
||||
$zwdmList[] = $item['CODE'];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -400,25 +400,47 @@ class CrawlerService
|
||||
}
|
||||
|
||||
/**
|
||||
* 从HTML中提取地区选项
|
||||
* 从HTML中提取地区选项(仅从地市/地区相关的 select 中提取,并过滤占位项与乱码)
|
||||
* @param string $html HTML内容
|
||||
* @return array
|
||||
*/
|
||||
public function extractDsdmOptions(string $html): array
|
||||
{
|
||||
$options = [];
|
||||
$placeholderValues = ['0', '-1', ''];
|
||||
$placeholderTexts = ['请选择', '请选择地区', '请选择地市', '全部', ''];
|
||||
|
||||
// 使用正则表达式提取option标签
|
||||
preg_match_all('/<option\s+value="([^"]+)"[^>]*>([^<]+)<\/option>/i', $html, $matches);
|
||||
// 先尝试只匹配「地市」相关 select 内的 option,减少误匹配
|
||||
if (preg_match('/<select[^>]*地市[^>]*>.*?<\/select>/is', $html, $selectBlock)) {
|
||||
$html = $selectBlock[0];
|
||||
}
|
||||
|
||||
preg_match_all('/<option\s+value="([^"]*)"[^>]*>([^<]*)<\/option>/i', $html, $matches);
|
||||
|
||||
if (!empty($matches[1])) {
|
||||
foreach ($matches[1] as $index => $value) {
|
||||
if (!empty($value)) { // 跳过"请选择"等空值
|
||||
$options[] = [
|
||||
'value' => $value,
|
||||
'text' => $matches[2][$index] ?? $value,
|
||||
];
|
||||
$value = trim($value);
|
||||
$text = trim($matches[2][$index] ?? $value);
|
||||
|
||||
if (in_array($value, $placeholderValues, true)) {
|
||||
continue;
|
||||
}
|
||||
if ($text === '' || in_array($text, $placeholderTexts, true)) {
|
||||
continue;
|
||||
}
|
||||
// 过滤乱码:含控制字符或非 UTF-8 的视为无效
|
||||
if (preg_match('/[\x00-\x08\x0B\x0C\x0E-\x1F]/', $text) || !mb_check_encoding($text, 'UTF-8')) {
|
||||
continue;
|
||||
}
|
||||
// 过滤明显非地区名称的短乱码(如单字符乱码)
|
||||
if (mb_strlen($text) < 2 && !preg_match('/^[\x{4e00}-\x{9fa5}A-Za-z0-9]+$/u', $text)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$options[] = [
|
||||
'value' => $value,
|
||||
'text' => $text,
|
||||
];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user