mirror of
https://git.phreedom.club/localhost_frssoft/FMN_bot.git
synced 2025-04-19 22:56:31 +02:00
81 lines
2.9 KiB
Python
81 lines
2.9 KiB
Python
|
import sqlite3
|
|||
|
import gzip
|
|||
|
|
|||
|
# Module-wide connection and cursor; the functions in this file use both
# directly as globals.
conn = sqlite3.connect("imdb_titles.sqlite")
c = conn.cursor()

# One-time setup executed at import: make sure the `titles` table exists,
# then issue the two pragmas (synchronous is switched off, presumably to
# speed up the bulk import at the cost of durability).
for _setup_sql in (
    '''CREATE TABLE IF NOT EXISTS titles(tt_id INTEGER UNIQUE, type VARCHAR (50), original_name VARCHAR (500) DEFAULT NULL, ru_name VARCHAR (500) DEFAULT NULL, year INTEGER DEFAULT NULL)''',
    "PRAGMA synchronous = OFF",
    "PRAGMA optimize",
):
    c.execute(_setup_sql)
del _setup_sql  # keep the module namespace limited to `conn` and `c`
conn.commit()
|
|||
|
|
|||
|
|
|||
|
def convert_tsv_to_db(title_basics_tsv):
    '''Convert the main (title.basics) dataset into the sqlite database.

    Every parsable row is written to the ``titles`` table with
    ``INSERT OR REPLACE``. Rows whose type is neither "movie" nor "video"
    are stored with ``original_name`` and ``year`` set to NULL. Rows that
    cannot be parsed (the TSV header line, for instance) are reported on
    stdout and skipped. According to the original author, a full run takes
    quite long (5-10 minutes).

    Args:
        title_basics_tsv: path to the gzip-compressed TSV file.
    '''
    # Decode explicitly as UTF-8 rather than the platform's locale encoding,
    # which would garble non-ASCII titles on non-UTF-8 systems.
    with gzip.open(title_basics_tsv, mode='rt', encoding='utf-8') as file:
        for line in file:
            # Drop the line terminator so the last column is clean even on
            # rows with fewer columns than expected.
            fields = line.rstrip("\n").split("\t")
            try:
                # "tt0000001" -> 1
                tt_id = int(fields[0].split("tt")[1])
                tt_type = fields[1]
                original_name = fields[3]
                ru_name = None
                year = fields[5]

                if tt_type not in ("movie", "video"):
                    # Only the id and type are kept for other title kinds.
                    original_name = None
                    year = "\\N"
                else:
                    print(tt_id, tt_type, original_name, ru_name, year)

                # "\N" is the marker for a missing value.
                year = None if year == "\\N" else int(year)
                c.execute(
                    "INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)",
                    (tt_id, tt_type, original_name, ru_name, year))
            except (IndexError, ValueError, sqlite3.Error) as err:
                # Best effort: report the bad row and carry on with the rest.
                print(err)
    conn.commit()
|
|||
|
|
|||
|
def extract_ru_locale_from_tsv(title_akas_tsv):
    '''Convert the localized-titles (title.akas) dataset and add it to the database.

    For every row whose region column is "RU", the title is written to
    ``ru_name`` of the matching ``titles`` row, provided that row exists and
    its type is "movie" or "video". Rows that cannot be parsed are reported
    on stdout and skipped.

    Args:
        title_akas_tsv: path to the gzip-compressed TSV file.
    '''
    # Decode explicitly as UTF-8 rather than the platform's locale encoding,
    # which would garble Cyrillic titles on non-UTF-8 systems.
    with gzip.open(title_akas_tsv, mode='rt', encoding='utf-8') as file:
        for line in file:
            fields = line.rstrip("\n").split("\t")
            try:
                if fields[3] != "RU":
                    continue

                # "tt0000001" -> 1
                tt_id = int(fields[0].split("tt")[1])
                row = c.execute(
                    "SELECT type FROM titles WHERE tt_id = ?", (tt_id,)
                ).fetchone()
                # A title absent from the table is skipped explicitly
                # instead of failing on ``None[0]``.
                if row is None:
                    continue
                tt_type = row[0]
                if tt_type not in ("movie", "video"):
                    continue

                ru_name = fields[2]
                print(ru_name, tt_type)
                c.execute("UPDATE titles SET ru_name = ? WHERE tt_id = ?", (ru_name, tt_id))
            except (IndexError, ValueError, sqlite3.Error) as err:
                # Best effort: report the bad row and carry on with the rest.
                print(err)
    conn.commit()
|
|||
|
|
|||
|
def convert_datasets_to_db():
    '''Run both import stages against the dataset files in the working directory.

    First loads "title.basics.tsv.gz" into the database, then applies the
    RU-localized titles from "title.akas.tsv.gz". A progress message is
    printed before each stage.
    '''
    stages = (
        ("Converting tsv dataset to sqlite...", convert_tsv_to_db, "title.basics.tsv.gz"),
        ("Unpack ru locale...", extract_ru_locale_from_tsv, "title.akas.tsv.gz"),
    )
    for message, run_stage, archive_name in stages:
        print(message)
        run_stage(archive_name)
|
|||
|
|
|||
|
|
|||
|
def get_title_by_id(films_ids=()):
    '''Fetch the ``titles`` rows for the given numeric title ids.

    Args:
        films_ids: iterable of title ids (the numeric part of "tt..." ids).
            Defaults to an empty tuple; the previous default was the builtin
            ``list`` type itself, which made a call without arguments fail.

    Returns:
        A list with one entry per requested id, in the same order: the full
        row tuple, or ``None`` when no row has that id.
    '''
    # Bind the id as a parameter rather than formatting it into the SQL text.
    tt_list = [
        c.execute("SELECT * FROM titles WHERE tt_id = ?", (film_id,)).fetchone()
        for film_id in films_ids
    ]
    print(tt_list)
    return tt_list
|
|||
|
|