# Mirror of https://git.phreedom.club/localhost_frssoft/FMN_bot.git
# (synced 2025-04-19 17:16:30 +02:00; 106 lines, 3.7 KiB, Python)
import sqlite3
|
||
import gzip
|
||
|
||
# Module-wide SQLite connection and cursor; every function in this file
# reads and writes through these two objects.
conn = sqlite3.connect("imdb_titles.sqlite")
c = conn.cursor()

# Create the schema on first use, then apply the pragmas, in this exact order.
# "synchronous = OFF" trades durability for bulk-insert speed.
for _statement in (
    '''CREATE TABLE IF NOT EXISTS titles(tt_id INTEGER UNIQUE, type VARCHAR (50), original_name VARCHAR (500) DEFAULT NULL, ru_name VARCHAR (500) DEFAULT NULL, year INTEGER DEFAULT NULL)''',
    "PRAGMA synchronous = OFF",
    "PRAGMA optimize",
):
    c.execute(_statement)
conn.commit()
|
||
|
||
|
||
def convert_tsv_to_db(title_basics_tsv):
    """Import a gzipped, tab-separated title dump into the ``titles`` table.

    Original author's note: the conversion of the main dataset is slow
    (about 5 minutes).

    Each line is split on tabs. Column 0 holds the ``tt``-prefixed numeric
    id, column 1 the title type, column 3 the original name and column 5 the
    year. Rows whose type is not ``movie`` or ``video`` are stored with only
    their id and type (name and year are set to NULL). ``ru_name`` is always
    written as NULL here.

    Lines that cannot be parsed (for example the header line) are reported
    with ``print`` and skipped. Rows are written in batches of 1000 with
    ``INSERT OR REPLACE``; the final, possibly shorter, batch is written as
    well before the transaction is committed.

    Args:
        title_basics_tsv: Path to the gzip-compressed TSV file.
    """
    insert_sql = "INSERT OR REPLACE INTO titles(tt_id, type, original_name, ru_name, year) VALUES (?, ?, ?, ?, ?)"
    chunk = 1000
    write_dataset = []
    progress_counter = 0

    # The IMDb dumps are UTF-8; be explicit so the platform's locale default
    # is never used to decode them.
    with gzip.open(title_basics_tsv, mode='rt', encoding='utf-8') as file:
        for line in file:
            fields = line.split("\t")
            try:
                tt_id = int(fields[0].split("tt")[1])
                tt_type = fields[1]
                original_name = fields[3]
                year = fields[5]
                # A year starting with backslash-N is the dump's "no value" marker.
                if year.startswith(r"\N"):
                    year = None
                else:
                    year = int(year)
            except (IndexError, ValueError) as E:
                # Best effort: report the malformed line and keep going.
                print(E)
                continue

            if tt_type not in ("movie", "video"):
                original_name = None
                year = None

            write_dataset.append((tt_id, tt_type, original_name, None, year))
            if len(write_dataset) >= chunk:
                c.executemany(insert_sql, write_dataset)
                progress_counter += len(write_dataset)
                write_dataset = []
                print(f'Обработано: {progress_counter}')

    # Flush the trailing partial batch; without this the last rows of the
    # file would never reach the database.
    if write_dataset:
        c.executemany(insert_sql, write_dataset)
    conn.commit()
|
||
|
||
def extract_ru_locale_from_tsv(title_akas_tsv):
    """Fill ``titles.ru_name`` from a gzipped TSV of localized titles.

    Each line is split on tabs. Column 0 holds the ``tt``-prefixed numeric
    id, column 2 the localized title and column 3 the region code. Only
    lines whose region is exactly ``RU`` are considered, and only for ids
    already stored in ``titles`` with type ``movie`` or ``video``. Ids that
    are not in the table are skipped silently.

    All matching (name, id) pairs are collected in memory and applied with a
    single ``UPDATE`` batch, then committed. If an id appears several times,
    the updates run in file order, so the last one wins.

    Args:
        title_akas_tsv: Path to the gzip-compressed TSV file.
    """
    ru_name_writer = []

    # The IMDb dumps are UTF-8; be explicit so Cyrillic titles are decoded
    # correctly regardless of the platform's locale default.
    with gzip.open(title_akas_tsv, mode='rt', encoding='utf-8') as file:
        for line in file:
            fields = line.split("\t")
            try:
                if fields[3] != "RU":
                    continue
                tt_id = int(fields[0].split("tt")[1])
                ru_name = fields[2]
            except (IndexError, ValueError) as E:
                # Best effort: report the malformed line and keep going.
                print(E)
                continue

            row = c.execute("SELECT type FROM titles WHERE tt_id = ?", (tt_id,)).fetchone()
            # fetchone() returns None for ids missing from the table.
            if row is None or row[0] not in ("movie", "video"):
                continue

            ru_name_writer.append((ru_name, tt_id))
            print(f'Обработано ru_name: {len(ru_name_writer)}')

    c.executemany("UPDATE titles SET ru_name = ? WHERE tt_id = ?", ru_name_writer)
    conn.commit()
|
||
|
||
def convert_datasets_to_db():
    """Run both import steps in order against the fixed dataset file names.

    First the main title dump is loaded, then the Russian names are merged
    in; a progress message is printed before each step.
    """
    steps = (
        ("Converting tsv dataset to sqlite...", convert_tsv_to_db, "title.basics.tsv.gz"),
        ("Unpack ru locale...", extract_ru_locale_from_tsv, "title.akas.tsv.gz"),
    )
    for message, loader, dataset in steps:
        print(message)
        loader(dataset)
|
||
|
||
|
||
def get_title_by_id(films_ids=()):
    """Look up rows of ``titles`` by numeric id.

    Args:
        films_ids: Iterable of integer ``tt_id`` values. Defaults to an empty
            tuple (the previous default was the ``list`` type itself, which
            made a call without arguments fail on iteration).

    Returns:
        A list of full ``titles`` rows, one per id that was found, in the
        order the ids were given; ``None`` when nothing was found.
    """
    found = []
    for film_id in films_ids:
        row = c.execute("SELECT * FROM titles WHERE tt_id = (?)", (film_id,)).fetchone()
        if row:
            found.append(row)
    # Callers get None, not an empty list, when there are no matches.
    return found or None
|
||
|
||
|
||
def get_title_by_names_and_year(film_names=()):
    """Look up rows of ``titles`` by name and year.

    Args:
        film_names: Iterable of 3-item sequences that are bound, in order,
            to the query's three placeholders: the value compared with
            ``original_name``, the value compared with ``ru_name``, and the
            year. Defaults to an empty tuple (the previous default was the
            ``list`` type itself, which made a call without arguments fail
            on iteration).

    Returns:
        A list with the first matching ``titles`` row for each item that
        matched, in input order; ``None`` when nothing matched.
    """
    query = '''SELECT * FROM titles
               WHERE (original_name = (?) OR ru_name = (?))
               AND year = (?)'''
    matches = []
    for name_and_year in film_names:
        row = c.execute(query, name_and_year).fetchone()
        if row:
            matches.append(row)
    # Callers get None, not an empty list, when there are no matches.
    return matches or None
|