Skip to content

Commit

Permalink
Harmonize salary informations for RegionJob jobboard #84
Browse files Browse the repository at this point in the history
  • Loading branch information
badele committed Dec 1, 2013
1 parent 9e79e7c commit c118f47
Show file tree
Hide file tree
Showing 3 changed files with 184 additions and 9 deletions.
2 changes: 1 addition & 1 deletion jc/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def cleanSalary(self):

# Salary not found: fall back to the original text
if self.salary_min == 0:
self.salary_cleaned = self.salary
self.salary_cleaned = utilities.filter_salary_fr(self.salary)
return

# Month salary
Expand Down
161 changes: 160 additions & 1 deletion jobboards/RegionJob.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ def analyzePage(self, page):

# Salary
self.datas['salary'] = self._extractRubrique("Salaire", item)
self.filterSalaries(self.datas)

# Insert to jobboard table
self.datas['state'] = 'ACTIVE'
Expand Down Expand Up @@ -204,6 +205,13 @@ def createTable(self,):
location TEXT, \
department TEXT, \
salary TEXT, \
salary_min FLOAT, \
salary_max FLOAT, \
salary_nbperiod INTEGER, \
salary_unit FLOAT, \
salary_bonus TEXT, \
salary_minbonus FLOAT, \
salary_maxbonus FLOAT, \
state TEXT, \
PRIMARY KEY(offerid))""" % self.name)

Expand All @@ -212,7 +220,7 @@ def insertToJBTable(self):
conn.text_factory = str
cursor = conn.cursor()
try:
cursor.execute("INSERT INTO jb_%s VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)" %
cursor.execute("INSERT INTO jb_%s VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)" %
self.name, (
self.datas['offerid'],
self.datas['lastupdate'],
Expand All @@ -228,6 +236,13 @@ def insertToJBTable(self):
self.datas['location'],
self.datas['department'],
self.datas['salary'],
self.datas['salary_min'],
self.datas['salary_max'],
self.datas['salary_nbperiod'],
self.datas['salary_unit'],
self.datas['salary_bonus'],
self.datas['salary_minbonus'],
self.datas['salary_maxbonus'],
self.datas['state'],

)
Expand Down Expand Up @@ -260,6 +275,13 @@ def createOffer(self, data):
o.location = data['location']
o.department = data['department']
o.salary = data['salary']
o.salary_min = data['salary_min']
o.salary_max = data['salary_max']
o.salary_unit = data['salary_unit']
o.salary_nbperiod = data['salary_nbperiod']
o.salary_bonus = data['salary_bonus']
o.salary_minbonus = data['salary_minbonus']
o.salary_maxbonus = data['salary_maxbonus']
o.date_pub = data['date_pub']
o.date_add = data['date_add']
o.state = data['state']
Expand All @@ -268,3 +290,140 @@ def createOffer(self, data):
return o

return None

def filterSalaries(self, data):
    """Split the raw RegionJob salary text into structured salary fields.

    Reads ``self.datas['salary']`` and (re)writes the following keys of
    ``self.datas``: ``salary_unit``, ``salary_min``, ``salary_max``,
    ``salary_nbperiod``, ``salary_bonus``, ``salary_minbonus`` and
    ``salary_maxbonus``.

    The derived keys are always reset first, so an offer with an empty or
    unrecognised salary never keeps values parsed for a previous offer and
    the keys always exist for the database insert.

    Recognised range formats, tried in order (first match wins):

    * ``"2000/2500 ... par mois sur 13 mois"``: monthly range, the number
      of periods is read from the text;
    * ``"2000 a 2500 Euros"`` (with a-grave): range in euros, 12 periods;
    * ``"30 a 40 KEuros"``, ``"30/40 KE(uros)"``, ``"30-40 KE(uros)"``:
      range in thousands of euros, stored in euros.

    If no range matches, a single amount written as ``"<amount> Euros."``
    or ``"<amount> Euros/mois."`` is stored in ``salary_min``.

    ``salary_unit`` is set to 1 for the euro formats and to 12 for the
    KEuros formats. Whatever follows the matched amount is stored
    verbatim in ``salary_bonus``. ``salary_minbonus`` and
    ``salary_maxbonus`` are only reset to 0; nothing here parses them.

    :param data: unused; the method works on ``self.datas``. Kept so that
        existing callers (``self.filterSalaries(self.datas)``) still work.
    :returns: None. ``self.datas`` is updated in place.
    """
    datas = self.datas

    # Reset unconditionally: these keys are read by the insert even when
    # the offer carries no salary text at all.
    datas['salary_unit'] = ''
    datas['salary_min'] = 0
    datas['salary_max'] = 0
    datas['salary_nbperiod'] = 0
    datas['salary_bonus'] = ''
    datas['salary_minbonus'] = 0
    datas['salary_maxbonus'] = 0

    salary = datas['salary']
    if not salary:
        return

    flags = re.MULTILINE | re.DOTALL

    # Patterns are plain u'' literals with escaped backslashes (and \xe0
    # for a-grave): same regex text as a ur'' literal, but also valid
    # syntax on Python 3.
    #
    # (pattern, salary_unit, multiplier, nbperiod, bonus group)
    # nbperiod None means "read it from capture group 3".
    range_rules = (
        (u'([0-9]+)/([0-9]+).*par mois sur (.*?) mois\\.?(.*)',
         1, 1, None, 4),
        (u'.*?([0-9]+) \xe0 ([0-9]+) Euros\\.?(.*)',
         1, 1, 12, 3),
        # KEuros amounts are thousands of euros: scale them and use the
        # same unit as the two other KEuros formats below.
        (u'.*?([0-9]+) \xe0 ([0-9]+) KEuros\\.?(.*)',
         12, 1000, 12, 3),
        (u'([0-9]+)/([0-9]+) KE(uros)?(.*)',
         12, 1000, 12, 4),
        (u'.*?([0-9]+)-([0-9]+) KE(uros)?(.*)',
         12, 1000, 12, 4),
    )

    for pattern, unit, factor, nbperiod, bonus_group in range_rules:
        m = re.search(pattern, salary, flags=flags)
        if not m:
            continue

        periods = nbperiod
        if periods is None:
            try:
                periods = int(m.group(3))
            except ValueError:
                # Period count is not a plain integer: treat the rule as
                # not matching instead of aborting the whole page.
                continue

        datas['salary_unit'] = unit
        # Groups 1 and 2 are [0-9]+ so float() cannot fail here.
        datas['salary_min'] = float(m.group(1)) * factor
        datas['salary_max'] = float(m.group(2)) * factor
        datas['salary_nbperiod'] = periods
        datas['salary_bonus'] = m.group(bonus_group)
        return

    # No range found: look for a single amount.
    m = re.search(u'(.*?) Euros(/mois)?\\.(.*)', salary, flags=flags)
    if not m:
        return

    try:
        # Strip separators (spaces, punctuation, underscores) from the
        # free-text capture before converting it.
        amount = float(re.sub(r'[\W_]', '', m.group(1)))
    except ValueError:
        # The text before "Euros" is not a number: keep the defaults.
        return

    datas['salary_unit'] = 1
    datas['salary_min'] = amount
    datas['salary_max'] = 0
    datas['salary_nbperiod'] = 12
    datas['salary_bonus'] = m.group(3)
30 changes: 23 additions & 7 deletions utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,8 +327,13 @@ def db_delete_jobboard_datas(configs, jobboardname):
conn = lite.connect(configs.globals['database'])
cursor = conn.cursor()

# Check whether all offers must be deleted
filter = ''
if jobboardname != 'ALL':
filter = "where source='%s'" % jobboardname

# Delete the jobboard's data from the offers table
sql = "delete from offers where source='%s'" % jobboardname
sql = "delete from offers %s" % filter
cursor.execute(sql)

# # Delete jobbor datas on jobboard table
Expand Down Expand Up @@ -460,9 +465,16 @@ def filter_location_fr(location):
def filter_salary_fr(salary):
# TODO: use a regexp once we have a better view of the possible combinations
# TODO : use something similar as ^...$
# Selon profil
salary = re.sub(ur'.*selon profil.*', "NA", salary, flags=re.DOTALL | re.IGNORECASE)
salary = re.sub(ur'Selon diplôme et expérience', "NA", salary)
flags = re.DOTALL | re.IGNORECASE
men_salary = salary

salary = re.sub(ur'selon profil', '', salary, flags=flags)
salary = re.sub(ur"Selon (l')?exp.?rien.?e", '', salary, flags=flags)
salary = re.sub(ur'.? n.?gocier', '', salary, flags=flags)
salary = re.sub(ur'.? d.?finir', '', salary, flags=flags)
salary = re.sub(ur'Non pr.?cis.?', '', salary, flags=flags)
salary = re.sub(ur'Selon dipl.?me', '', salary, flags=flags)
salary = re.sub(ur'et exp.?rience', '', salary, flags=flags)
#salary = re.sub(ur'fixe + variable selon profil', "NA", salary)
#salary = re.sub(ur'Fixe+Variable selon profil', "NA", salary)
#salary = re.sub(ur'Fixe + Variable selon profil', "NA", salary)
Expand All @@ -476,9 +488,6 @@ def filter_salary_fr(salary):
#salary = re.sub(ur'à négocier selon le profil', "NA", salary)
#salary = re.sub(ur'à déterminer selon profil', "NA", salary)
salary = re.sub(ur'à définir selon expérience', "NA", salary)
salary = re.sub(ur'A négocier selon expérience.', "NA", salary)
salary = re.sub(ur'A négocier selon expérience', "NA", salary)
salary = re.sub(ur'à négocier selon expérience', "NA", salary)
salary = re.sub(ur'à négocier selon exp', "NA", salary)
salary = re.sub(ur'à negocier K€ brut/an', "NA", salary)
#salary = re.sub(ur'A voir selon profil', "NA", salary)
Expand Down Expand Up @@ -593,4 +602,11 @@ def filter_salary_fr(salary):
salary = re.sub(ur'NC', "NA", salary)
salary = re.sub(ur'nc', "NA", salary)

# Final clean
salary = re.sub(ur'\.$', "", salary)
if salary.strip() == '':
salary = "NA"
else:
salary = men_salary

return salary

0 comments on commit c118f47

Please sign in to comment.