fix: Improve skill scanner parser for 3-column layout

The previous parser was too simple and couldn't handle the merged text
from OCR on the skills window.

IMPROVEMENTS:
1. Clean up common headers and category names from OCR text
2. Better regex pattern that handles merged text
3. Alternative parser as fallback for heavily merged text
4. Debug logging to show parsed skills
5. Validation to filter out bad matches

PARSING LOGIC:
- Finds pattern: SkillName Rank Points
- Handles multi-word skill names (e.g., 'Combat Reflexes')
- Recognizes all EU skill ranks (Newbie through Awesome)
- Validates points are reasonable numbers

This should correctly parse skills like:
  Aim Amazing 5524
  Combat Reflexes Incredible 5991
  Handgun Grand Master 8621
This commit is contained in:
LemonNexus 2026-02-15 00:05:05 +00:00
parent 8e49f4e45e
commit a30bcbaba7
1 changed file with 74 additions and 8 deletions

View File

@ -51,20 +51,86 @@ class SkillOCRThread(QThread):
self.scan_error.emit(str(e))
def _parse_skills(self, text):
"""Parse skill data from OCR text."""
"""Parse skill data from OCR text with improved handling for 3-column layout."""
skills = {}
# Ranks in Entropia Universe (in order)
RANKS = [
'Newbie', 'Inept', 'Beginner', 'Amateur', 'Average',
'Skilled', 'Expert', 'Professional', 'Master', 'Grand Master',
'Champion', 'Legendary', 'Guru', 'Astonishing', 'Remarkable',
'Outstanding', 'Marvelous', 'Prodigious', 'Amazing', 'Incredible', 'Awesome'
]
rank_pattern = '|'.join(RANKS)
# Clean up the text - remove common headers and junk
text = text.replace('SKILLS', '').replace('ALL CATEGORIES', '')
text = text.replace('SKILL NAME', '').replace('RANK', '').replace('POINTS', '')
text = text.replace('Attributes', '').replace('COMBAT', '').replace('Design', '')
text = text.replace('Construction', '').replace('Defense', '').replace('General', '')
text = text.replace('Handgun', '').replace('Heavy Melee Weapons', '')
text = text.replace('Information', '').replace('Inflict Melee Damage', '')
text = text.replace('Inflict Ranged Damage', '').replace('Light Melee Weapons', '')
text = text.replace('Longblades', '').replace('Medical', '').replace('Mining', '')
text = text.replace('Science', '').replace('Social', '').replace('Beauty', '')
text = text.replace('Mindforce', '')
lines = text.split('\n')
for line in lines:
# Pattern: SkillName Rank Points
line = line.strip()
if not line:
continue
# Skip category headers and short lines
if len(line) < 10:
continue
# Try pattern: SkillName Rank Points
# More flexible pattern to handle merged text
match = re.search(
r'(\w+(?:\s+\w+)*)\s+(Newbie|Inept|Beginner|Amateur|Average|Skilled|Expert|Professional|Master|Grand Master|Champion|Legendary|Guru|Astonishing|Remarkable|Outstanding|Marvelous|Prodigious|Amazing|Incredible|Awesome)\s+(\d+)',
rf'([A-Za-z][A-Za-z\s]{{2,50}}?)\s+({rank_pattern})\s+(\d{{1,6}})(?:\s|$)',
line, re.IGNORECASE
)
if match:
skill_name = match.group(1).strip()
rank = match.group(2)
points = int(match.group(3))
# Clean up skill name
skill_name = skill_name.strip()
# Validate - points should be reasonable (not too small)
if points > 0:
skills[skill_name] = {
'rank': rank,
'points': points,
'scanned_at': datetime.now().isoformat()
}
print(f"[SkillScanner] Parsed: {skill_name} = {rank} ({points})")
# Alternative parsing: try to find skill-rank-points triplets
if not skills:
skills = self._parse_skills_alternative(text, RANKS)
return skills
def _parse_skills_alternative(self, text, ranks):
"""Alternative parser for when text is heavily merged."""
skills = {}
# Find all rank positions in the text
for rank in ranks:
# Look for pattern: [text] [Rank] [number]
pattern = rf'([A-Z][a-z]{{2,}}(?:\s+[A-Z][a-z]{{2,}}){{0,3}})\s+{rank}\s+(\d{{1,6}})'
matches = re.finditer(pattern, text, re.IGNORECASE)
for match in matches:
skill_name = match.group(1).strip()
points = int(match.group(2))
if points > 0 and len(skill_name) > 2:
skills[skill_name] = {
'rank': rank,
'points': points,