Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
mael-app committed Jan 21, 2025
2 parents 430e372 + be78716 commit e2d3197
Show file tree
Hide file tree
Showing 19 changed files with 138 additions and 114 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v4

- name: Create pip cache directory
run: mkdir -p /home/runner/.cache/pip

- name: Setup Python
uses: actions/setup-python@v5
with:
Expand Down
8 changes: 0 additions & 8 deletions .idea/.gitignore

This file was deleted.

14 changes: 0 additions & 14 deletions .idea/discord.xml

This file was deleted.

15 changes: 0 additions & 15 deletions .idea/inspectionProfiles/Project_Default.xml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/inspectionProfiles/profiles_settings.xml

This file was deleted.

7 changes: 0 additions & 7 deletions .idea/misc.xml

This file was deleted.

8 changes: 0 additions & 8 deletions .idea/modules.xml

This file was deleted.

10 changes: 0 additions & 10 deletions .idea/tekbetter-scrapper.iml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/vcs.xml

This file was deleted.

8 changes: 7 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ RUN pip install --no-cache-dir --user -r requirements.txt
# Stage 2: Runtime image
FROM python:3.11-slim-bookworm

# Install runtime dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
nodejs \
&& rm -rf /var/lib/apt/lists/*

# OCI labels
LABEL org.opencontainers.image.title="TekBetter Scraper" \
org.opencontainers.image.description="TekBetter Scraping Service" \
Expand All @@ -36,7 +42,7 @@ RUN useradd -m -s /bin/bash scraper && \
# Set environment variables
ENV PYTHONPATH=/tekbetter \
PYTHONUNBUFFERED=1 \
SCRAPERS_CONFIG_FILE=/tekbetter/scrapers.json
SCRAPER_CONFIG_FILE=/tekbetter/scrapers.json

# Switch to non-root user
USER scraper
Expand Down
54 changes: 53 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,63 @@ The scraper will scrape the following data:
* From the `my.epitech.eu` (api.epitest.eu):
* All your projects tests results ("Moulinettes")

## Configuration Examples

### `config.json` Sample

Below is a sample configuration file (`config.json`) required for the scraper:

```json
{
"student_interval": 60,
"students": [
{
"microsoft_session": "YOUR_MICROSOFT_SESSION_TOKEN_HERE",
"tekbetter_token": "YOUR_TEKBETTER_TOKEN_HERE"
}
]
}
```

### Run the Scraper with Docker CLI

To run the scraper using Docker CLI, use the following command:

```sh
docker run -d \
--name tekbetter-scraper \
--restart always \
--env TEKBETTER_API_URL="https://tekbetter.ovh" \
--env SCRAPER_MODE="private" \
--env SCRAPER_CONFIG_FILE="/tekbetter/scrapers.json" \
--volume /etc/localtime:/etc/localtime:ro \
--volume $(pwd)/config.json:/tekbetter/scrapers.json \
r.tekbetter.ovh/tekbetter/tekbetter-scraper:latest
```

### Run the Scraper with Docker Compose
Alternatively, you can use a `docker-compose.yml` file to run the scraper. Before proceeding, ensure that your `config.json` is in the same directory as the `docker-compose.yml` file:

```yml
services:
tekbetter:
container_name: tekbetter-scraper
restart: always
image: r.tekbetter.ovh/tekbetter/tekbetter-scraper:latest
environment:
TEKBETTER_API_URL: "https://tekbetter.ovh"
SCRAPER_MODE: "private"
SCRAPER_CONFIG_FILE: "/tekbetter/scrapers.json"
volumes:
- /etc/localtime:/etc/localtime:ro
- ./config.json:/tekbetter/scrapers.json
```

## Environment variables

The following environment variables are required:

- `TEKBETTER_API_URL`: The URL of the TekBetter API
- `PUBLIC_SCRAPER_TOKEN`: The token to authenticate to the TekBetter API, only for the `Public mode`
- `SCRAPER_MODE`: The mode of the scraper, either `private` or `public`
- `SCRAPER_CONFIG_FILE`: The path to the configuration file, only for the `Private mode`

30 changes: 24 additions & 6 deletions app/intranet/intranet_antiddos_bypass.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,27 @@
import execjs
import base64
import requests

import urllib.parse
from app.config import USER_AGENT

def decode_js_content(encoded_str):
    """Python equivalent of JavaScript's decodeURIComponent(escape(...)).

    Encoding to latin-1 and decoding with 'unicode_escape' reproduces the
    effect of the legacy JS escape() step; urllib.parse.unquote then performs
    the decodeURIComponent step.

    :param encoded_str: raw string lifted from the anti-DDoS challenge page
    :return: the decoded string, or an error message string if decoding fails
    """
    try:
        raw_bytes = encoded_str.encode('latin1')
        unescaped = raw_bytes.decode('unicode_escape')
        return urllib.parse.unquote(unescaped)
    except Exception as e:
        # NOTE: callers receive this message string instead of an exception
        return f"Erreur lors du décodage : {e}"


class IntranetAntiDDoSBypasser:
def __init__(self):
    """Initialize empty state for a fresh anti-DDoS bypass attempt."""
    self.cookies = {}  # cookies harvested from responses during the current attempt
    self.headers = {}  # HTTP headers sent with the bypass requests
    self.saved_cookies = {}  # cookies from the last successful bypass — presumably reused by later requests; confirm against callers

def extract_cookies_from_response(self, resp):
for cookie in resp.cookies:
self.cookies[cookie.name] = cookie.value
Expand All @@ -24,8 +36,8 @@ def regenerate_cookies(self):
self.saved_cookies = cookies
return cookies
except Exception as e:
pass
raise Exception("Failed to regenerate cookies")
print(e)
raise Exception("Failed to regenerate anti-ddos cookies")

def try_pass(self):
self.cookies = {}
Expand All @@ -35,12 +47,18 @@ def try_pass(self):
resp = requests.get("https://intra.epitech.eu/", headers=self.headers)
self.extract_cookies_from_response(resp)

js_puzzle = None
# Check if the response contains the javascript puzzle
if not "eval(decodeURIComponent(escape(window.atob(" in resp.text:
if "eval(decodeURIComponent(escape(window.atob(" in resp.text:
# Extract the JS code from the HTML page (decode the base64 string)
js_puzzle = base64.b64decode(resp.text.split("eval(decodeURIComponent(escape(window.atob('")[1].split("'))))")[0]).decode("utf-8")
elif "eval(decodeURIComponent(escape" in resp.text:
# Extract the JS code from the HTML page
js_puzzle = decode_js_content(resp.text.split("eval(decodeURIComponent(escape('")[1].split("'))")[0])

if js_puzzle is None:
raise Exception("Failed to extract the javascript puzzle")

# Extract the JS code from the HTML page (decode the base64 string)
js_puzzle = base64.b64decode(resp.text.split("eval(decodeURIComponent(escape(window.atob('")[1].split("'))))")[0]).decode("utf-8")

# What is secret header ?
# It's a header needed for the 2nd request, it's value is calculated with a random-variable-name.
Expand Down
25 changes: 15 additions & 10 deletions app/intranet/intranet_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,27 @@ class IntranetNotFoundError(Exception):

class IntranetApi:
def __init__(self):
self.antiddos_bypasser = IntranetAntiDDoSBypasser()
pass

def _build_cookies(self, cookies: dict = {}):
def _build_cookies(self, cookies: dict, student: Student):
"""
Build the cookies dict for the antiddos page
:param cookies: List of cookies tuples
:return: dict
"""
cookies_dict = self.antiddos_bypasser.saved_cookies
if cookies is None:
cookies = {}
cookies_dict = student.antiddos.saved_cookies
for key, value in cookies.items():
cookies_dict[key] = value
return cookies_dict

def pass_antiddos(self):
def pass_antiddos(self, student: Student):
"""
Pass the anti-ddos page
"""
log_info("Trying to pass the anti-ddos page")
self.antiddos_bypasser.regenerate_cookies()
student.antiddos.regenerate_cookies()
log_info("Anti-ddos page passed")

def login(self, student, allow_retry=True):
Expand All @@ -54,25 +56,28 @@ def login(self, student, allow_retry=True):
# Microsoft request
msoft_resp = requests.get(INTRANET_LOGIN_URL, cookies=self._build_cookies({
"ESTSAUTHPERSISTENT": student.microsoft_session
}), headers=HEADERS, allow_redirects=False)
}, student), headers=HEADERS, allow_redirects=False)

if msoft_resp.status_code != 302:
log_error(f"Invalid Microsoft session for the student: {student.student_label}")
raise Exception(f"Invalid Microsoft session for the student: {student.student_label}")
# Get the "Location" response header
location = msoft_resp.headers["Location"]
intra_resp = requests.get(location, headers=HEADERS, cookies=self._build_cookies({}), allow_redirects=False)
intra_resp = requests.get(location, headers=HEADERS, cookies=self._build_cookies({}, student), allow_redirects=False)

if intra_resp.status_code == 503: # Anti-ddos page
if allow_retry:
self.pass_antiddos()
self.pass_antiddos(student)
return self.login(student, allow_retry=False)
log_error("AntiDDoS already passed, but still got a 503 error")
raise Exception("AntiDDoS already passed, but still got a 503 error")
if intra_resp.status_code not in [204, 302]:
log_error(f"Failed to login to Intranet API for the student: {student.student_label}")
raise IntranetLoginError(f"Failed to login to Intranet API for the student: {student.student_label}")
# Extract the token from the Set-Cookie header
if not "Set-Cookie" in intra_resp.headers:
log_error(f"Failed to login to Intranet API for the student: {student.student_label}")
raise IntranetLoginError(f"Failed to login to Intranet API for the student: {student.student_label}")
token = intra_resp.headers["Set-Cookie"].split("user=")[1].split(";")[0]
student.intra_token = token
return token
Expand All @@ -82,12 +87,12 @@ def api_request(self, url, student_obj: Student, allow_retry=True, timeout=60):
self.login(student_obj)
res = requests.get(f"https://intra.epitech.eu/{url}", headers=HEADERS, cookies=self._build_cookies({
"user": student_obj.intra_token
}), timeout=timeout)
}, student_obj), timeout=timeout)
if res.status_code == 200:
return res.json()
if res.status_code == 503:
if allow_retry:
self.pass_antiddos()
self.pass_antiddos(student=student_obj)
return self.api_request(url, student_obj, allow_retry=False, timeout=timeout)
raise Exception("Failed to pass the anti-ddos page")

Expand Down
10 changes: 5 additions & 5 deletions app/intranet/intranet_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,11 @@ def fetch_projects(self, student: Student, start_date: datetime, end_date: datet
final.append(activity)
return final

def fetch_project_slug(self, intra_project_json: dict, student: Student):
scolyear = intra_project_json['scolaryear']
codemodule = intra_project_json['codemodule']
codeinstance = intra_project_json['codeinstance']
codeacti = intra_project_json['codeacti']
def fetch_project_slug(self, ask_json: dict, student: Student):
scolyear = ask_json['year']
codemodule = ask_json['module']
codeinstance = ask_json['instance']
codeacti = ask_json['code_acti']

url = f"module/{scolyear}/{codemodule}/{codeinstance}/{codeacti}/project/?format=json"

Expand Down
16 changes: 7 additions & 9 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,13 @@ def sync_student(self, student):
traceback.print_exc()

# Fetch project slugs for the asked projects
if body["intra_projects"]:
try:
for proj in body["intra_projects"]:
if proj["codeacti"] in asked_slugs:
slug = self.intranet.fetch_project_slug(proj, student)
body["projects_slugs"][proj["codeacti"]] = slug
except Exception as e:
log_error(f"Failed to fetch Intranet project slugs for student: {student.student_label}")
traceback.print_exc()
try:
for proj in asked_slugs:
slug = self.intranet.fetch_project_slug(proj, student)
body["projects_slugs"][proj["code_acti"]] = slug
except Exception as e:
log_error(f"Failed to fetch Intranet project slugs for student: {student.student_label}")
traceback.print_exc()
log_info(f"Pushing data for student: {student.student_label}")

res = requests.post(f"{os.getenv('TEKBETTER_API_URL')}/api/scraper/push", json=body, headers={
Expand Down
3 changes: 3 additions & 0 deletions app/model/Student.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from app.intranet.intranet_antiddos_bypass import IntranetAntiDDoSBypasser


class Student:
microsoft_session: str
Expand All @@ -6,3 +8,4 @@ class Student:
intra_token: str = None
last_sync: int = 0
student_label: str = None
antiddos: IntranetAntiDDoSBypasser = None
Loading

0 comments on commit e2d3197

Please sign in to comment.