The following article covers all three versions (serial, shared-memory, and distributed-memory), with somewhat more emphasis on user-facing aspects:
@article{li05,
    author  = {Xiaoye S. Li},
    title   = {An Overview of {SuperLU}: Algorithms, Implementation,
               and User Interface},
    journal = {ACM Trans.\ Mathematical Software},
    volume  = {31},
    number  = {3},
    month   = sep,
    year    = {2005},
    pages   = {302--325},
}
The Users' Guide:
@techreport{superlu_ug99,
     author      = {X. S. Li and J. W. Demmel and J. R. Gilbert and L. Grigori
                    and M. Shao and I. Yamazaki},
     title       = {{SuperLU Users' Guide}},
     institution = {Lawrence Berkeley National Laboratory},
     number      = {LBNL-44289},
     month       = sep,
     year        = {1999},
     note        = {\url{https://portal.nersc.gov/project/sparse/superlu/ug.pdf}
                    Last update: June 2018}
}

Supernodal approach to sparse partial pivoting:
@article{superlu99,
    author  = {James W. Demmel and Stanley C. Eisenstat and
               John R. Gilbert and Xiaoye S. Li and Joseph W. H. Liu},
    title   = {A Supernodal Approach to Sparse Partial Pivoting},
    journal = {SIAM J. Matrix Analysis and Applications},
    volume  = {20},
    number  = {3},
    pages   = {720--755},
    year    = {1999}
}
Supernodal incomplete LU factorization with partial pivoting:
@article{lishao10,
     author  = {Xiaoye S. Li and Meiyue Shao},
     title   = {A Supernodal Approach to Incomplete {LU} Factorization
                with Partial Pivoting},
     journal = {ACM Trans.\ Mathematical Software},
     volume  = {37},
     number  = {4},
     year    = {2010},
}
Asynchronous parallel supernodal algorithm for sparse Gaussian elimination:
@article{superlu_smp99,
    author  = {James W. Demmel and John R. Gilbert and Xiaoye S. Li},
    title   = {An Asynchronous Parallel Supernodal Algorithm for
               Sparse {Gaussian} Elimination},
    journal = {SIAM J. Matrix Analysis and Applications},
    volume  = {20},
    number  = {4},
    pages   = {915--952},
    year    = {1999}
}
The distributed-memory solver paper (title names it as SuperLU-underscore-DIST):
@article{lidemmel03,
     author  = {Xiaoye S. Li and James W. Demmel},
     title   = {{SuperLU\_DIST}: A Scalable Distributed-Memory Sparse Direct
                Solver for Unsymmetric Linear Systems},
     journal = {ACM Trans.\ Mathematical Software},
     volume  = {29},
     number  = {2},
     pages   = {110--140},
     month   = jun,
     year    = {2003}
}
Parallel symbolic factorization for sparse LU with static pivoting:
@article{grigoridemmelli07,
  author  = {Laura Grigori and James W. Demmel and Xiaoye S. Li},
  title   = {Parallel Symbolic Factorization for Sparse {LU} with
             Static Pivoting},
  journal = {SIAM J. Scientific Computing},
  volume  = {29},
  number  = {3},
  pages   = {1289--1314},
  year    = {2007}
}
  Distributed-memory heavy-weight perfect matching on bipartite graphs:
  @article{hwpm,
  author  = {A. Azad and A. Bulu{\c{c}} and X. S. Li and X. Wang and J. Langguth},
  title   = {A Distributed-Memory Algorithm for Computing a Heavy-Weight
             Perfect Matching on Bipartite Graphs},
  journal = {SIAM J. Scientific Computing},
  volume  = {42},
  number  = {4},
  pages   = {C143--C168},
  year    = {2020}
  }