diff --git a/jc/data.py b/jc/data.py index f8bf8fa..37f8fe6 100644 --- a/jc/data.py +++ b/jc/data.py @@ -144,7 +144,7 @@ def cleanSalary(self): # Not find the salary, return de origin text if self.salary_min == 0: - self.salary_cleaned = self.salary + self.salary_cleaned = utilities.filter_salary_fr(self.salary) return # Month salary diff --git a/jobboards/RegionJob.py b/jobboards/RegionJob.py index 0749000..35f1994 100644 --- a/jobboards/RegionJob.py +++ b/jobboards/RegionJob.py @@ -173,6 +173,7 @@ def analyzePage(self, page): # Salary self.datas['salary'] = self._extractRubrique("Salaire", item) + self.filterSalaries(self.datas) # Insert to jobboard table self.datas['state'] = 'ACTIVE' @@ -204,6 +205,13 @@ def createTable(self,): location TEXT, \ department TEXT, \ salary TEXT, \ + salary_min FLOAT, \ + salary_max FLOAT, \ + salary_nbperiod INTEGER, \ + salary_unit FLOAT, \ + salary_bonus TEXT, \ + salary_minbonus FLOAT, \ + salary_maxbonus FLOAT, \ state TEXT, \ PRIMARY KEY(offerid))""" % self.name) @@ -212,7 +220,7 @@ def insertToJBTable(self): conn.text_factory = str cursor = conn.cursor() try: - cursor.execute("INSERT INTO jb_%s VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)" % + cursor.execute("INSERT INTO jb_%s VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)" % self.name, ( self.datas['offerid'], self.datas['lastupdate'], @@ -228,6 +236,13 @@ def insertToJBTable(self): self.datas['location'], self.datas['department'], self.datas['salary'], + self.datas['salary_min'], + self.datas['salary_max'], + self.datas['salary_nbperiod'], + self.datas['salary_unit'], + self.datas['salary_bonus'], + self.datas['salary_minbonus'], + self.datas['salary_maxbonus'], self.datas['state'], ) @@ -260,6 +275,13 @@ def createOffer(self, data): o.location = data['location'] o.department = data['department'] o.salary = data['salary'] + o.salary_min = data['salary_min'] + o.salary_max = data['salary_max'] + o.salary_unit = data['salary_unit'] + o.salary_nbperiod = data['salary_nbperiod'] 
def filterSalaries(self, data):
    """Parse the free-text salary of the current offer into numeric fields.

    Reads ``self.datas['salary']`` and writes the following keys back into
    ``self.datas``:

    * ``salary_min`` / ``salary_max`` -- bounds as floats (0 when unknown;
      ``salary_max`` stays 0 when a single amount is given),
    * ``salary_unit`` -- 1 for an amount per month, 12 for an amount per
      year, ``''`` when nothing could be parsed,
    * ``salary_nbperiod`` -- number of paid months (12 unless the text
      says "par mois sur N mois"; 0 when unknown),
    * ``salary_bonus`` -- the text following the recognised amount,
    * ``salary_minbonus`` / ``salary_maxbonus`` -- always reset to 0.

    All keys are reset on every call, even when the salary text is empty,
    so that later reads of ``self.datas['salary_*']`` never hit a missing
    key or a value left over from a previous offer.

    :param data: unused; kept so existing callers, which pass
        ``self.datas``, keep working.
    :return: None -- ``self.datas`` is updated in place.
    """
    datas = self.datas
    search_flags = re.MULTILINE | re.DOTALL

    def to_amount(text):
        """Return ``text`` as a float once separators are stripped, else None."""
        try:
            return float(re.sub(r'[\W_]', '', text))
        except ValueError:
            # Captured text contains words, not just a number.
            return None

    # Defaults, written unconditionally (see docstring).
    datas.update({
        'salary_unit': '',
        'salary_min': 0,
        'salary_max': 0,
        'salary_nbperiod': 0,
        'salary_bonus': '',
        'salary_minbonus': 0,
        'salary_maxbonus': 0,
    })

    salary = datas.get('salary')
    if not salary:
        return

    # Salary ranges, tried in order; the first match wins.
    # Columns: pattern, multiplier, unit, group holding the number of paid
    # months (None -> 12), group holding the trailing bonus text.
    # Patterns are plain u'' literals (u'\xe0' is "a grave") so the source
    # is valid on Python 2 and 3 whatever the file encoding.
    range_patterns = (
        # "1800/2200 ... par mois sur 13 mois. ..."
        (u'([0-9]+)/([0-9]+).*par mois sur (.*?) mois\\.?(.*)', 1, 1, 3, 4),
        # "1800 a 2200 Euros. ..."
        (u'.*?([0-9]+) \xe0 ([0-9]+) Euros\\.?(.*)', 1, 1, None, 3),
        # "35 a 40 KEuros. ..." -- thousands per year, like the other
        # KE patterns below.
        (u'.*?([0-9]+) \xe0 ([0-9]+) KEuros\\.?(.*)', 1000, 12, None, 3),
        # "35/40 KE ..." or "35/40 KEuros ..."
        (u'([0-9]+)/([0-9]+) KE(uros)?(.*)', 1000, 12, None, 4),
        # "35-40 KE ..." or "35-40 KEuros ..."
        (u'.*?([0-9]+)-([0-9]+) KE(uros)?(.*)', 1000, 12, None, 4),
    )

    for pattern, scale, unit, periods_group, bonus_group in range_patterns:
        m = re.search(pattern, salary, flags=search_flags)
        if not m:
            continue

        periods = 12
        if periods_group is not None:
            try:
                periods = int(m.group(periods_group))
            except ValueError:
                # "sur <not a number> mois": period count is unknown.
                periods = 0

        # Groups 1 and 2 are pure digit runs, so float() cannot fail here.
        datas['salary_unit'] = unit
        datas['salary_min'] = float(m.group(1)) * scale
        datas['salary_max'] = float(m.group(2)) * scale
        datas['salary_nbperiod'] = periods
        datas['salary_bonus'] = m.group(bonus_group)
        return

    # Single amount: "2 000 Euros." or "2000 Euros/mois. ..."
    m = re.search(
        u'(.*?) Euros(/mois)?\\.(.*)',
        salary,
        flags=search_flags
    )
    if m:
        amount = to_amount(m.group(1))
        # When the text before "Euros" is not a bare number the offer is
        # left with the defaults instead of raising ValueError.
        if amount is not None:
            datas['salary_unit'] = 1
            datas['salary_min'] = amount
            datas['salary_max'] = 0
            datas['salary_nbperiod'] = 12
            datas['salary_bonus'] = m.group(3)
table @@ -460,9 +465,16 @@ def filter_location_fr(location): def filter_salary_fr(salary): # TODO : use regexp once whe have a better view of possible combinations # TODO : use something similar as ^...$ - # Selon profil - salary = re.sub(ur'.*selon profil.*', "NA", salary, flags=re.DOTALL | re.IGNORECASE) - salary = re.sub(ur'Selon diplôme et expérience', "NA", salary) + flags = re.DOTALL | re.IGNORECASE + men_salary = salary + + salary = re.sub(ur'selon profil', '', salary, flags=flags) + salary = re.sub(ur"Selon (l')?exp.?rien.?e", '', salary, flags=flags) + salary = re.sub(ur'.? n.?gocier', '', salary, flags=flags) + salary = re.sub(ur'.? d.?finir', '', salary, flags=flags) + salary = re.sub(ur'Non pr.?cis.?', '', salary, flags=flags) + salary = re.sub(ur'Selon dipl.?me', '', salary, flags=flags) + salary = re.sub(ur'et exp.?rience', '', salary, flags=flags) #salary = re.sub(ur'fixe + variable selon profil', "NA", salary) #salary = re.sub(ur'Fixe+Variable selon profil', "NA", salary) #salary = re.sub(ur'Fixe + Variable selon profil', "NA", salary) @@ -476,9 +488,6 @@ def filter_salary_fr(salary): #salary = re.sub(ur'à négocier selon le profil', "NA", salary) #salary = re.sub(ur'à déterminer selon profil', "NA", salary) salary = re.sub(ur'à définir selon expérience', "NA", salary) - salary = re.sub(ur'A négocier selon expérience.', "NA", salary) - salary = re.sub(ur'A négocier selon expérience', "NA", salary) - salary = re.sub(ur'à négocier selon expérience', "NA", salary) salary = re.sub(ur'à négocier selon exp', "NA", salary) salary = re.sub(ur'à negocier K€ brut/an', "NA", salary) #salary = re.sub(ur'A voir selon profil', "NA", salary) @@ -593,4 +602,11 @@ def filter_salary_fr(salary): salary = re.sub(ur'NC', "NA", salary) salary = re.sub(ur'nc', "NA", salary) + # Final clean + salary = re.sub(ur'\.$', "", salary) + if salary.strip() == '': + salary = "NA" + else: + salary = men_salary + return salary