51 lines
1.4 KiB
Python
51 lines
1.4 KiB
Python
import re
|
|
from typing import Any, Dict, List
|
|
|
|
|
|
# Matches an "@handle" mention; captures the handle (3+ letters, digits or
# underscores) without the leading "@".
# NOTE(review): the pattern has no left boundary, so the domain part of an
# e-mail address ("john@example.com" -> "example") is also captured as a
# username -- confirm whether that is intended.
USERNAME_PATTERN = re.compile(r"@([A-Za-z0-9_]{3,})")

# Matches a t.me link with an optional http:// or https:// scheme; captures
# the first path segment (3+ letters, digits or underscores).
TME_PATTERN = re.compile(r"(?:https?://)?t\.me/([A-Za-z0-9_]{3,})")

# Matches a run of 4+ digits wrapped in backticks; captures only the digits.
ID_PATTERN = re.compile(r"`(\d{4,})`")


def extract_entities(text: str) -> List[Dict[str, Any]]:
    """Extract user ids, @usernames and t.me links from *text*.

    Entities are grouped by kind (user ids, then usernames, then t.me
    links). Within each group they appear in the order of their first
    occurrence in *text*, so the result is deterministic for a given input.

    Usernames and links are de-duplicated case-insensitively; the spelling
    of the first occurrence is kept in ``"display"``.

    Args:
        text: The text to scan. Any falsy value (``""`` or ``None``)
            produces an empty list.

    Returns:
        A list of dicts. Every dict has ``"type"`` (``"user_id"``,
        ``"username"`` or ``"tme_link"``) and ``"value"``; username and
        link entries also carry ``"display"`` with the original casing.
        For links both ``"value"`` and ``"display"`` are prefixed with
        ``"t.me/"``, and ``"value"`` is lower-cased.
    """
    if not text:
        return []

    entities: List[Dict[str, Any]] = []
    # (type, normalized value) pairs already emitted. Matches are walked in
    # text order rather than through set(), whose iteration order for
    # strings changes between interpreter runs.
    seen = set()

    for user_id in ID_PATTERN.findall(text):
        key = ("user_id", user_id)
        if key in seen:
            continue
        seen.add(key)
        entities.append({
            "type": "user_id",
            "value": user_id,
        })

    for username in USERNAME_PATTERN.findall(text):
        # The capture group only admits [A-Za-z0-9_], so the match is never
        # empty and never carries surrounding whitespace.
        value = username.lower()
        key = ("username", value)
        if key in seen:
            continue
        seen.add(key)
        entities.append({
            "type": "username",
            "value": value,
            "display": username,
        })

    for link in TME_PATTERN.findall(text):
        display = f"t.me/{link}"
        value = display.lower()
        key = ("tme_link", value)
        if key in seen:
            continue
        seen.add(key)
        entities.append({
            "type": "tme_link",
            "value": value,
            "display": display,
        })

    return entities
|