From 18bc87980f86b165fcd306b7aa508abe825b9114 Mon Sep 17 00:00:00 2001
From: rfhferreira <rfhferreira@inf.ufpr.br>
Date: Tue, 3 Jun 2025 14:03:38 -0300
Subject: [PATCH] Adding Indexer

---
 ElasticSearch/indexer.py         | 142 +++++++++++++++++++++++++++++++
 ElasticSearch/requirements.txt   |   2 +
 ElasticSearch/sql/collection.sql |  34 ++++++++
 ElasticSearch/sql/resource.sql   |  53 ++++++++++++
 ElasticSearch/sql/user.sql       |  31 +++++++
 5 files changed, 262 insertions(+)
 create mode 100644 ElasticSearch/indexer.py
 create mode 100644 ElasticSearch/requirements.txt
 create mode 100644 ElasticSearch/sql/collection.sql
 create mode 100644 ElasticSearch/sql/resource.sql
 create mode 100644 ElasticSearch/sql/user.sql

diff --git a/ElasticSearch/indexer.py b/ElasticSearch/indexer.py
new file mode 100644
index 0000000..8db9e59
--- /dev/null
+++ b/ElasticSearch/indexer.py
@@ -0,0 +1,142 @@
+import csv
+import os
+import json
+import requests
+from datetime import datetime
+from dotenv import load_dotenv
+
+load_dotenv('../.env')
+
+# ELASTICSEARCH_HOST must include the scheme (e.g. http://localhost): the request URL below is built as host:port
+es_host = os.getenv('ELASTICSEARCH_HOST')
+es_port = os.getenv('ELASTICSEARCH_PORT')
+
+index_prefix = os.getenv('ELASTIC_INDEX_PREFIX')
+
+# Elasticsearch credentials
+es_username = os.getenv('ELASTICSEARCH_USER')
+es_password = os.getenv('ELASTICSEARCH_PASSWORD')
+
+print(es_host, es_port, es_password, es_username, index_prefix)
+
+lo_file_path = './csvs/lo.csv'
+users_file_path = './csvs/users.csv'
+collections_file_path = './csvs/collections.csv'
+
+def read_lo_metadata(csv_file_path):
+    print("\n\nREADING LOS\n\n")
+    metadata = []
+    with open(csv_file_path, newline='', encoding='utf-8') as csvfile:
+        reader = csv.DictReader(csvfile, delimiter="|")
+        for row in reader:
+            parsed_row = {
+                "id": int(row["id"]),
+                "name": row["name"],
+                "author": row["author"],
+                "description": row["description"],
+                "link": row["link"],
+                "created_at": datetime.fromisoformat(row["created_at"]) if row["created_at"] else None,
+                "educational_stages": row["educational_stages"].split(",") if row["educational_stages"] else [],
+                "languages": row["languages"].split(",") if row["languages"] else [],
+                "subjects": row["subjects"].split(",") if row["subjects"] else [],
+                "license": row["license"],
+                "object_type": row["object_type"],
+                "resource_state": row["resource_state"],
+                "user": row["user"],
+                "user_id": row["user_id"],
+                "views": int(row["views"]),
+                "downloads": int(row["downloads"]),
+                "likes": int(row["likes"]),
+                "shares": int(row["shares"]),
+                "score": float(row["score"]),
+                "comments": int(row["comments"]),
+                "saves": int(row["saves"])
+            }
+
+            print(parsed_row)
+            metadata.append(parsed_row)
+    return metadata
+
+def read_users_metadata(csv_file_path):
+    print("\n\nREADING USERS\n\n")
+    metadata = []
+    with open(csv_file_path, newline='', encoding='utf-8') as csvfile:
+        reader = csv.DictReader(csvfile, delimiter="|")
+        for row in reader:
+            parsed_row = {
+                "id": int(row["id"]),
+                "name": row["name"],
+                "username": row["username"],
+                "description": row["description"],
+                "roles": row["roles"].split(",") if row["roles"] else [],
+                "institutions": row["institutions"],
+                "created_at": datetime.fromisoformat(row["created_at"].split('.')[0]) if row["created_at"] else None,  # drop the milliseconds
+                "is_active": row["is_active"] == "t" if row["is_active"] in ("t", "f") else None,  # map the Postgres boolean (t, f) to the ES one (true, false)
+                "score": float(row["score"]),
+                "likes": int(row["likes"]),
+                "followers": int(row["followers"]),
+                "approved_resources": int(row["approved_resources"])
+            }
+            print(parsed_row)
+            metadata.append(parsed_row)
+    return metadata
+
+def read_collections_metadata(csv_file_path):
+    print("\n\nREADING COLLECTIONS\n\n")
+    metadata = []
+    with open(csv_file_path, newline='', encoding='utf-8') as csvfile:
+        reader = csv.DictReader(csvfile, delimiter="|")
+        for row in reader:
+            parsed_row = {
+                "id": int(row["id"]),
+                "name": row["name"],
+                "user": row["user"],
+                "user_id": row["user_id"],
+                "description": row["description"],
+                "created_at": datetime.fromisoformat(row["created_at"].split('.')[0]) if row["created_at"] else None,  # drop the milliseconds
+                "is_private": row["is_private"] == "t" if row["is_private"] in ("t", "f") else None,  # map the Postgres boolean (t, f) to the ES one (true, false)
+                "is_active": row["is_active"] == "t" if row["is_active"] in ("t", "f") else None,  # map the Postgres boolean (t, f) to the ES one (true, false)
+                "score": float(row["score"]),
+                "views": int(row["views"]),
+                "downloads": int(row["downloads"]),
+                "likes": int(row["likes"]),
+                "shares": int(row["shares"]),
+                "followers": int(row["followers"])
+            }
+            print(parsed_row)
+            metadata.append(parsed_row)
+    return metadata
+
+def index_metadata_to_elasticsearch(metadata, index):
+    for item in metadata:
+
+        es_doc = {key.lower(): (value.isoformat() if isinstance(value, datetime) else value) for key, value in item.items()}
+
+
+        # print('-------------------------')
+        # print(item['name'])
+        # print('-------------------------')
+
+        response = requests.post(
+            f'{es_host}:{es_port}/{index_prefix}_{index}/_doc',
+            headers={"Content-Type": "application/json"},
+            data=json.dumps(es_doc),
+            auth=(es_username, es_password)  # Pass credentials here
+        )
+
+        print(response.json())
+
+
+def main():
+    metadata = read_lo_metadata(lo_file_path)
+    index_metadata_to_elasticsearch(metadata, 'resources')
+
+    metadata = read_users_metadata(users_file_path)
+    index_metadata_to_elasticsearch(metadata, 'users')
+
+    metadata = read_collections_metadata(collections_file_path)
+    index_metadata_to_elasticsearch(metadata, 'collections')
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ElasticSearch/requirements.txt b/ElasticSearch/requirements.txt
new file mode 100644
index 0000000..282562e
--- /dev/null
+++ b/ElasticSearch/requirements.txt
@@ -0,0 +1,2 @@
+requests
+python-dotenv
diff --git a/ElasticSearch/sql/collection.sql b/ElasticSearch/sql/collection.sql
new file mode 100644
index 0000000..3b5c9bf
--- /dev/null
+++ b/ElasticSearch/sql/collection.sql
@@ -0,0 +1,34 @@
+select
+    col.id,
+    col.name,
+    users.name as user,
+    users.id as user_id,
+    col.description,
+    col.created_at,
+    col.is_private,
+    col.is_active,
+    colst.score,
+    colst.views,
+    colst.downloads,
+    colst.likes,
+    colst.shares,
+    colst.follows as followers
+from
+    collections col
+    left join users on users.id = col.user_id
+    left join collection_stats as colst on colst.id = col.id
+group by
+    col.id,
+    col.name,
+    users.name,
+    users.id,
+    col.description,
+    col.created_at,
+    col.is_private,
+    col.is_active,
+    colst.score,
+    colst.views,
+    colst.downloads,
+    colst.likes,
+    colst.shares,
+    colst.follows;
diff --git a/ElasticSearch/sql/resource.sql b/ElasticSearch/sql/resource.sql
new file mode 100644
index 0000000..2b330a9
--- /dev/null
+++ b/ElasticSearch/sql/resource.sql
@@ -0,0 +1,53 @@
+select
+    re.id,
+    re.name,
+    re.author,
+    re.description,
+    re.link,
+    re.published_at as created_at,
+    string_agg(distinct es.name, ',') as educational_stages,
+    string_agg(distinct lg.name, ',') as languages,
+    string_agg(distinct sj.name, ',') as subjects,
+    lc.name as license,
+    ot.name as object_type,
+    re.state as resource_state,
+    us.name as user,
+    us.id as user_id,
+    rest.views as views,
+    rest.downloads as downloads,
+    rest.likes as likes,
+    rest.shares as shares,
+    rest.score as score,
+    rest.comments as comments,
+    count(distinct cr.id) as saves
+from
+    resources re
+    left join resource_educational_stages res on res.resource_id = re.id
+    left join educational_stages es on res.educational_stage_id = es.id
+    left join resource_languages rel on rel.resource_id = re.id
+    left join languages lg on rel.language_id = lg.id
+    left join resource_subjects rs on rs.resource_id = re.id
+    left join subjects sj on rs.subject_id = sj.id
+    left join licenses lc on re.license_id = lc.id
+    left join object_types ot on re.object_type_id = ot.id
+    left join users us on us.id = re.user_id
+    left join resource_stats rest on rest.id = re.resource_stats_id
+    left join collection_resources cr on cr.resource_id = re.id
+group by
+    re.id,
+    re.name,
+    re.author,
+    re.description,
+    re.link,
+    re.published_at,
+    re.state,
+    ot.name,
+    lc.name,
+    us.name,
+    us.id,
+    rest.views,
+    rest.downloads,
+    rest.likes,
+    rest.shares,
+    rest.score,
+    rest.comments;
diff --git a/ElasticSearch/sql/user.sql b/ElasticSearch/sql/user.sql
new file mode 100644
index 0000000..0f1962a
--- /dev/null
+++ b/ElasticSearch/sql/user.sql
@@ -0,0 +1,31 @@
+select
+    users.id,
+    users.name,
+    users.username,
+    users.description,
+    string_agg(distinct roles.name, ',') as roles,
+    string_agg(distinct inst.name, ',') as institutions,
+    users.created_at,
+    users.is_active,
+    usst.score,
+    usst.likes_received as likes,
+    usst.followers,
+    usst.approved_resources
+from
+    users
+    left join user_roles ur on users.id = ur.user_id
+    left join roles on roles.id = ur.role_id
+    left join user_institutions ui on ui.user_id = users.id
+    left join institutions inst on inst.id = ui.institution_id
+    left join user_stats usst on usst.id = users.user_stats_id
+group by
+    users.id,
+    users.name,
+    users.username,
+    users.description,
+    users.created_at,
+    users.is_active,
+    usst.score,
+    usst.likes_received,
+    usst.followers,
+    usst.approved_resources;
--
GitLab
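
Usage note (not part of the patch): indexer.py expects three pipe-delimited CSV exports under ./csvs/ (lo.csv, users.csv and collections.csv), which correspond to sql/resource.sql, sql/user.sql and sql/collection.sql, but the patch does not include the export step. The sketch below is one way those files could be produced; it assumes psycopg2 is installed and that a DATABASE_URL variable points at the source Postgres database, neither of which is defined by this patch. Exporting with psql's \copy and delimiter '|' would work equally well.

# export_csvs.py -- minimal sketch for generating the CSV files indexer.py reads
import csv
import os

import psycopg2  # assumed extra dependency, not in requirements.txt
from dotenv import load_dotenv

load_dotenv('../.env')

# Map each query file to the CSV path that indexer.py reads.
EXPORTS = {
    'sql/resource.sql': 'csvs/lo.csv',
    'sql/user.sql': 'csvs/users.csv',
    'sql/collection.sql': 'csvs/collections.csv',
}

def pg_text(value):
    # psql's COPY renders booleans as t/f, which is what indexer.py checks for,
    # so mimic that here; None becomes an empty field via csv's default handling.
    if value is True:
        return 't'
    if value is False:
        return 'f'
    return value

def export_all(dsn):
    os.makedirs('csvs', exist_ok=True)
    with psycopg2.connect(dsn) as conn:
        with conn.cursor() as cur:
            for sql_path, csv_path in EXPORTS.items():
                with open(sql_path, encoding='utf-8') as f:
                    cur.execute(f.read())
                with open(csv_path, 'w', newline='', encoding='utf-8') as out:
                    # indexer.py parses these files with csv.DictReader(delimiter="|"),
                    # so write a header row and use the same separator.
                    writer = csv.writer(out, delimiter='|')
                    writer.writerow([col.name for col in cur.description])
                    writer.writerows([pg_text(v) for v in row] for row in cur)

if __name__ == '__main__':
    export_all(os.getenv('DATABASE_URL'))  # DATABASE_URL is a placeholder name, not set by this patch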