From a30bcbaba7b4c12e413422f4b8cf2e1772b15dc0 Mon Sep 17 00:00:00 2001
From: LemonNexus <nexus@lemonlink.eu>
Date: Sun, 15 Feb 2026 00:05:05 +0000
Subject: [PATCH] fix: Improve skill scanner parser for 3-column layout

The previous parser was too simple and couldn't handle the merged text
from OCR on the skills window.

IMPROVEMENTS:
1. Clean up common headers and category names from OCR text
2. Better regex pattern that handles merged text
3. Alternative parser, used as a fallback when the primary pattern finds no skills (e.g. heavily merged text)
4. Debug logging to show parsed skills
5. Validation to filter out bad matches

PARSING LOGIC:
- Finds pattern: SkillName Rank Points
- Handles multi-word skill names (e.g., 'Combat Reflexes')
- Recognizes all EU skill ranks (Newbie through Awesome)
- Validates that points are positive (matches with 0 points are dropped)

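This should correctly parse skills like:
  Aim Amazing 5524
  Combat Reflexes Incredible 5991
  Handgun Grand Master 8621

Caveat: 'Handgun' is also on the header clean-up list, so the last
example is currently reduced to 'Grand Master 8621' before matching and
is parsed as skill 'Grand' with rank 'Master'.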
---
 plugins/skill_scanner/plugin.py | 82 +++++++++++++++++++++++++++++----
 1 file changed, 74 insertions(+), 8 deletions(-)

diff --git a/plugins/skill_scanner/plugin.py b/plugins/skill_scanner/plugin.py
index 1cd6190..f1401ac 100644
--- a/plugins/skill_scanner/plugin.py
+++ b/plugins/skill_scanner/plugin.py
@@ -51,25 +51,91 @@ class SkillOCRThread(QThread):
             self.scan_error.emit(str(e))
     
     def _parse_skills(self, text):
-        """Parse skill data from OCR text."""
+        """Parse skill data from OCR text with improved handling for 3-column layout."""
         skills = {}
+        
+        # Ranks in Entropia Universe (in order)
+        RANKS = [
+            'Newbie', 'Inept', 'Beginner', 'Amateur', 'Average',
+            'Skilled', 'Expert', 'Professional', 'Master', 'Grand Master',
+            'Champion', 'Legendary', 'Guru', 'Astonishing', 'Remarkable',
+            'Outstanding', 'Marvelous', 'Prodigious', 'Amazing', 'Incredible', 'Awesome'
+        ]
+        rank_pattern = '|'.join(RANKS)
+        
+        # Clean up the text - remove common headers and junk
+        text = text.replace('SKILLS', '').replace('ALL CATEGORIES', '')
+        text = text.replace('SKILL NAME', '').replace('RANK', '').replace('POINTS', '')
+        text = text.replace('Attributes', '').replace('COMBAT', '').replace('Design', '')
+        text = text.replace('Construction', '').replace('Defense', '').replace('General', '')
+        text = text.replace('Handgun', '').replace('Heavy Melee Weapons', '')
+        text = text.replace('Information', '').replace('Inflict Melee Damage', '')
+        text = text.replace('Inflict Ranged Damage', '').replace('Light Melee Weapons', '')
+        text = text.replace('Longblades', '').replace('Medical', '').replace('Mining', '')
+        text = text.replace('Science', '').replace('Social', '').replace('Beauty', '')
+        text = text.replace('Mindforce', '')
+        
         lines = text.split('\n')
         
         for line in lines:
-            # Pattern: SkillName Rank Points
+            line = line.strip()
+            if not line:
+                continue
+            
+            # Skip category headers and short lines
+            if len(line) < 10:
+                continue
+            
+            # Try pattern: SkillName Rank Points
+            # More flexible pattern to handle merged text
             match = re.search(
-                r'(\w+(?:\s+\w+)*)\s+(Newbie|Inept|Beginner|Amateur|Average|Skilled|Expert|Professional|Master|Grand Master|Champion|Legendary|Guru|Astonishing|Remarkable|Outstanding|Marvelous|Prodigious|Amazing|Incredible|Awesome)\s+(\d+)',
+                rf'([A-Za-z][A-Za-z\s]{{2,50}}?)\s+({rank_pattern})\s+(\d{{1,6}})(?:\s|$)',
                 line, re.IGNORECASE
             )
+            
             if match:
                 skill_name = match.group(1).strip()
                 rank = match.group(2)
                 points = int(match.group(3))
-                skills[skill_name] = {
-                    'rank': rank,
-                    'points': points,
-                    'scanned_at': datetime.now().isoformat()
-                }
+                
+                # Clean up skill name
+                skill_name = skill_name.strip()
+                
+                # Validate - keep only matches with a positive point count
+                if points > 0:
+                    skills[skill_name] = {
+                        'rank': rank,
+                        'points': points,
+                        'scanned_at': datetime.now().isoformat()
+                    }
+                    print(f"[SkillScanner] Parsed: {skill_name} = {rank} ({points})")
+        
+        # Alternative parsing: try to find skill-rank-points triplets
+        if not skills:
+            skills = self._parse_skills_alternative(text, RANKS)
+        
+        return skills
+    
+    def _parse_skills_alternative(self, text, ranks):
+        """Alternative parser for when text is heavily merged."""
+        skills = {}
+        
+        # For each known rank, find every name-rank-points triplet in the text
+        for rank in ranks:
+            # Look for pattern: [text] [Rank] [number]
+            pattern = rf'([A-Z][a-z]{{2,}}(?:\s+[A-Z][a-z]{{2,}}){{0,3}})\s+{rank}\s+(\d{{1,6}})'
+            matches = re.finditer(pattern, text, re.IGNORECASE)
+            
+            for match in matches:
+                skill_name = match.group(1).strip()
+                points = int(match.group(2))
+                
+                if points > 0 and len(skill_name) > 2:
+                    skills[skill_name] = {
+                        'rank': rank,
+                        'points': points,
+                        'scanned_at': datetime.now().isoformat()
+                    }
         
         return skills