fix: Improve skill scanner parser for 3-column layout
The previous parser was too simple and could not handle the merged text that OCR produces from the skills window.

IMPROVEMENTS:
1. Clean up common headers and category names from the OCR text.
2. Use a better regex pattern that handles merged text.
3. Add an alternative parser as a fallback for heavily merged text.
4. Add debug logging to show the parsed skills.
5. Add validation to filter out bad matches.

PARSING LOGIC:
- Finds the pattern: SkillName Rank Points.
- Handles multi-word skill names (e.g., 'Combat Reflexes').
- Recognizes all EU skill ranks (Newbie through Awesome).
- Validates that points are reasonable numbers.

This should correctly parse skills such as:
  Aim Amazing 5524
  Combat Reflexes Incredible 5991
  Handgun Grand Master 8621
This commit is contained in:
parent
8e49f4e45e
commit
a30bcbaba7
|
|
@ -51,25 +51,91 @@ class SkillOCRThread(QThread):
|
|||
self.scan_error.emit(str(e))
|
||||
|
||||
def _parse_skills(self, text):
|
||||
"""Parse skill data from OCR text."""
|
||||
"""Parse skill data from OCR text with improved handling for 3-column layout."""
|
||||
skills = {}
|
||||
|
||||
# Ranks in Entropia Universe (in order)
|
||||
RANKS = [
|
||||
'Newbie', 'Inept', 'Beginner', 'Amateur', 'Average',
|
||||
'Skilled', 'Expert', 'Professional', 'Master', 'Grand Master',
|
||||
'Champion', 'Legendary', 'Guru', 'Astonishing', 'Remarkable',
|
||||
'Outstanding', 'Marvelous', 'Prodigious', 'Amazing', 'Incredible', 'Awesome'
|
||||
]
|
||||
rank_pattern = '|'.join(RANKS)
|
||||
|
||||
# Clean up the text - remove common headers and junk
|
||||
text = text.replace('SKILLS', '').replace('ALL CATEGORIES', '')
|
||||
text = text.replace('SKILL NAME', '').replace('RANK', '').replace('POINTS', '')
|
||||
text = text.replace('Attributes', '').replace('COMBAT', '').replace('Design', '')
|
||||
text = text.replace('Construction', '').replace('Defense', '').replace('General', '')
|
||||
text = text.replace('Handgun', '').replace('Heavy Melee Weapons', '')
|
||||
text = text.replace('Information', '').replace('Inflict Melee Damage', '')
|
||||
text = text.replace('Inflict Ranged Damage', '').replace('Light Melee Weapons', '')
|
||||
text = text.replace('Longblades', '').replace('Medical', '').replace('Mining', '')
|
||||
text = text.replace('Science', '').replace('Social', '').replace('Beauty', '')
|
||||
text = text.replace('Mindforce', '')
|
||||
|
||||
lines = text.split('\n')
|
||||
|
||||
for line in lines:
|
||||
# Pattern: SkillName Rank Points
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
# Skip category headers and short lines
|
||||
if len(line) < 10:
|
||||
continue
|
||||
|
||||
# Try pattern: SkillName Rank Points
|
||||
# More flexible pattern to handle merged text
|
||||
match = re.search(
|
||||
r'(\w+(?:\s+\w+)*)\s+(Newbie|Inept|Beginner|Amateur|Average|Skilled|Expert|Professional|Master|Grand Master|Champion|Legendary|Guru|Astonishing|Remarkable|Outstanding|Marvelous|Prodigious|Amazing|Incredible|Awesome)\s+(\d+)',
|
||||
rf'([A-Za-z][A-Za-z\s]{{2,50}}?)\s+({rank_pattern})\s+(\d{{1,6}})(?:\s|$)',
|
||||
line, re.IGNORECASE
|
||||
)
|
||||
|
||||
if match:
|
||||
skill_name = match.group(1).strip()
|
||||
rank = match.group(2)
|
||||
points = int(match.group(3))
|
||||
skills[skill_name] = {
|
||||
'rank': rank,
|
||||
'points': points,
|
||||
'scanned_at': datetime.now().isoformat()
|
||||
}
|
||||
|
||||
# Clean up skill name
|
||||
skill_name = skill_name.strip()
|
||||
|
||||
# Validate - points should be reasonable (not too small)
|
||||
if points > 0:
|
||||
skills[skill_name] = {
|
||||
'rank': rank,
|
||||
'points': points,
|
||||
'scanned_at': datetime.now().isoformat()
|
||||
}
|
||||
print(f"[SkillScanner] Parsed: {skill_name} = {rank} ({points})")
|
||||
|
||||
# Alternative parsing: try to find skill-rank-points triplets
|
||||
if not skills:
|
||||
skills = self._parse_skills_alternative(text, RANKS)
|
||||
|
||||
return skills
|
||||
|
||||
def _parse_skills_alternative(self, text, ranks):
|
||||
"""Alternative parser for when text is heavily merged."""
|
||||
skills = {}
|
||||
|
||||
# Find all rank positions in the text
|
||||
for rank in ranks:
|
||||
# Look for pattern: [text] [Rank] [number]
|
||||
pattern = rf'([A-Z][a-z]{{2,}}(?:\s+[A-Z][a-z]{{2,}}){{0,3}})\s+{rank}\s+(\d{{1,6}})'
|
||||
matches = re.finditer(pattern, text, re.IGNORECASE)
|
||||
|
||||
for match in matches:
|
||||
skill_name = match.group(1).strip()
|
||||
points = int(match.group(2))
|
||||
|
||||
if points > 0 and len(skill_name) > 2:
|
||||
skills[skill_name] = {
|
||||
'rank': rank,
|
||||
'points': points,
|
||||
'scanned_at': datetime.now().isoformat()
|
||||
}
|
||||
|
||||
return skills
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue