backup_app_20Dec2022.txt

import streamlit as st
import os
import pickle
import pandas as pd
from PIL import Image
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from padelpy import padeldescriptor

# Page configuration
st.set_page_config(
  page_title='PARP1pred',
  page_icon='💊',
  initial_sidebar_state='expanded')

# Session state
if 'smiles_input' not in st.session_state:
  st.session_state.smiles_input = ''

if os.path.isfile('molecule.smi'):
  os.remove('molecule.smi') 
  
st.sidebar.title('💊 PARP1pred')

# Input SMILES
st.sidebar.subheader('Input molecule')

def insert_example_smiles():
    st.session_state.smiles_input = 'O=C(c1cc(Cc2n[nH]c(=O)c3ccccc23)ccc1F)N1CCN(C(=O)C2CC2)CC1'
def clear_smiles():
    st.session_state.smiles_input = ''

smiles_txt = st.sidebar.text_input('Enter SMILES notation', st.session_state.smiles_input)
st.sidebar.button('Example input', on_click=insert_example_smiles)
st.sidebar.button('Clear input', on_click=clear_smiles)


# Default page (loading for the first time)
if st.session_state.smiles_input == '':
  st.subheader('Welcome to the PARP1pred app!')
  st.info('PARP1pred allow users to predict whether a query molecule is active/inactive towards the PARP1 target protein.')
  
  st.warning('Enter SMILES notation in the sidebar to proceed', icon='👈')
  
  with st.expander('About this app'):
    st.markdown('''
      #### What is PARP1 and its therapeutic importance
      Poly (ADP-ribose) polymerase-1 (PARP-1) is an enzyme that catalyzes the ADP-ribosylation of a specific protein and plays a vital role in DNA repair. It has become an attractive target as inhibition of PARP-1 causes a toxic accumulation of DNA double strand breaks in cancer cells, particularly those with BRCA1/2 deficiency, which are found in breast, ovarian, prostate, and pancreatic cancers.

      #### Dataset
      In our work, we retrieved a human PARP-1 biological dataset from the ChEMBL database. The data was curated, and we received a non-redundant set of 2,018 PARP-1 inhibitors, which were divided into 1,720 active and 298 inactive compounds.

      #### Model performance
      We selected PubChem as a molecular fingerprint and used a random forest with an oversampling approach to construct the best model. The Matthews correlation coefficients in training, cross-validation, and test sets were 1.00, 0.96, and 0.74, respectively.

      #### Python libraries
      This app is based on the following Python libraries:
      - `streamlit`
      - `pandas`
      - `rdkit`
      - `padelpy`

      #### Citing us
      T. Lerksuthirat, S. Chitphuk, W. Stitchantrakul, D. Dejsuphong, A.A. Malik, C. Nantasenamat, PARP1PRED: A web server for screening the bioactivity of inhibitors against DNA repair enzyme PARP-1, ***EXCLI Journal*** (2023).
      ''')
  
  coverimage = Image.open('PARP1pred.jpg')
  st.image(coverimage)
  
else:
  st.subheader('⚛️ Input molecule:')
  with st.expander('Show SMILES'):
    #st.write('**SMILES**')
    st.text(smiles_txt)
  
  with st.expander('Show chemical structures'):
    #st.write('**Chemical structure**')
    smi = Chem.MolFromSmiles(smiles_txt)
    Chem.Draw.MolToFile(smi, 'molecule.png', width=900)
    mol_image = Image.open('molecule.png')
    st.image(mol_image)

# Input SMILES saved to file
f = open('molecule.smi', 'w')
f.write(f'{smiles_txt}\tmol_001')
f.close()


# Compute PADEL descriptors
if st.session_state.smiles_input != '':
  st.subheader('🔢 Descriptors')
  if os.path.isfile('molecule.smi'):
    padeldescriptor(mol_dir='molecule.smi', 
                    d_file='descriptors.csv',
                    descriptortypes='data/PubchemFingerprinter.xml', 
                    detectaromaticity=True,
                    standardizenitro=True,
                    standardizetautomers=True,
                    threads=2,
                    removesalt=True,
                    log=True,
                    fingerprints=True)

  descriptors = pd.read_csv('descriptors.csv')
  descriptors.drop('Name', axis=1, inplace=True)
  
  with st.expander('Show full set of descriptors as calculated for query molecule'):
    #st.write('**Full set of descriptors (calculated for query molecule)**')
    st.write(descriptors)
    st.write(descriptors.shape)
  

# Load descriptor subset used in trained model
if st.session_state.smiles_input != '':
  model = pickle.load(open('data/oversampling_PubChem_RandomForestClassifier.pkl', 'rb'))
  pubchem_subset = model.feature_names_in_

  query_desc_1 = descriptors.columns.difference(pubchem_subset)
  query_desc_2 = descriptors.drop(query_desc_1, axis=1)

  with st.expander('Show subset of descriptors as used in trained model'):
    #st.write('**Subset of descriptors (used in trained model)**')
    st.write(query_desc_2)
    st.write(query_desc_2.shape)

  
# Read in saved classification model
if st.session_state.smiles_input != '':
  st.subheader('🤖 Predictions')
  pred = int(model.predict(query_desc_2))
  if pred == 0:
    st.error('Inactive')
  if pred == 1:
    st.success('Active')