Building a Full-Text Search Engine for fastai YouTube channel

fastai
deeplearning
full-text search
SQLite
Streamlit
Author

Francisco Mussari

Published

December 4, 2022

Part 2. Deploying the web app to Streamlit

Introduction

In this post we are going to take the SQLite database with the searchable index we created in Part 1 and use it as the search engine for a web app we are going to deploy to Streamlit.

So we are going to be able to search over the entire fastai channel.

The Python file was created with a Jupyter Notebook using nbdev.

The web app is just like this one: Full-Text Search Engine for fastai Youtube Chanel

References

If you want to get deeper, I encourage you to read these articles:

Creating app.py

#|export
import streamlit as st
import sqlite3
#| export
db_path = '/mnt/m/datamatica/posts/full-text-search-fastai-youtube-channel/'

try:
    db = sqlite3.connect(db_path + 'fastai_yt.db')
except:
    db = sqlite3.connect('fastai_yt.db')

cur = db.cursor()
#| export
playlist = cur.execute('SELECT playlist_id, playlist_name FROM playlist').fetchall()
video = cur.execute('SELECT video_id, video_name FROM video').fetchall()
playlist = {p: n for p, n in playlist}
video = {p: n for p, n in video}
pl_sel = list(playlist.values())
pl_to_id = {v:k for k,v in playlist.items()}
#| export
st.title('Full-Text Search fastai Youtube Playlists')

# https://discuss.streamlit.io/t/select-all-on-a-streamlit-multiselect/9799/2

all_options = st.checkbox("Select all Playlists",
    key='sel_all', value=True)

container = st.container()
if all_options:
    sel_options = container.multiselect("Select one or more Playlist(s):", 
                                        pl_sel, disabled=True)
else:
    sel_options = container.multiselect("Select one or more Playlist(s):", 
                                        pl_sel, pl_sel)

if all_options: options = list(playlist.values())
else: options = sel_options

st.write('Selected playlist(s):', options)
#| export
def get_query(q, limit):
    
    search_in = 'text'
    
    if not( len(options)==len(pl_sel) or len(options)==0 ):
        search_in = 'transcriptions_fts'
        q_pl = '(playlist_id: '
        for pl in options:
            end = ' OR ' if pl != options[-1] else ')'
            q_pl = q_pl + f'"{pl_to_id[pl]}"' + end
        
        q = f"(text: {q}) AND " + q_pl

    query = f"""
    SELECT *, HIGHLIGHT(transcriptions_fts, 3, '[', ']')
    FROM transcriptions_fts WHERE {search_in} MATCH '{q}' ORDER BY rank
    LIMIT "{limit}" 
    """
    return query

with st.form("Input"):
    queryText = st.text_area("Search query: \ne.g. «fastc*», «fastcore OR paral*»", height=1, max_chars=None)
    limit_val = st.slider("Number of results:", min_value=5, max_value=20)
    btnResult = st.form_submit_button('Search')
    
if btnResult:
    if not queryText:
        st.text('Please enter a search query.')
    else:
        try:
            st.text('Search query generated:')
            # run query
            st.write(get_query(queryText, limit_val).replace('*', '\*'))
            res = cur.execute(get_query(q=queryText, limit=limit_val)).fetchall()
            st.text('Search results (click to go to YouTube):')

            res_md = ('  \n  '.join(['  \n  '.join([
                f"{i}.- Playlist: `{playlist[each[0]]}`, Video: `{video[each[1]]}`", 
                f"Caption: '...[{each[4].replace('[','**'
                ).replace(']','**')}](https://youtu.be/{each[1]}?t={str(int(each[2]))})...'", 
                '\n'])
                for i, each in enumerate(res)
            ]))

            st.markdown(res_md)
        except:
            st.text('Invalid search query.')
#| hide
from nbdev.export import nb_export
nb_export('_Deploy_Search_Engine_Streamlit.ipynb', lib_path='.', name='app')