@article {1773746, title = {Streamlining statistical reproducibility: NHLBI ORCHID clinical trial results reproduction}, journal = {JAMIA Open}, volume = {5}, number = {1}, year = {2022}, abstract = {Reproducibility in medical research has been a long-standing issue. More recently, the COVID-19 pandemic has publicly underlined this fact as the retraction of several studies reached out to general media audiences. A significant number of these retractions occurred after in-depth scrutiny of the methodology and results by the scientific community. Consequently, these retractions have undermined confidence in the peer-review process, which is not considered sufficiently reliable to generate trust in the published results. This partly stems from opacity in published results, the practical implementation of the statistical analysis often remaining undisclosed. We present a workflow that uses a combination of informatics tools to foster statistical reproducibility: an open-source programming language, Jupyter Notebook, cloud-based data repository, and an application programming interface can streamline an analysis and help to kick-start new analyses. We illustrate this principle by (1) reproducing the results of the ORCHID clinical trial, which evaluated the efficacy of hydroxychloroquine in COVID-19 patients, and (2) expanding on the analyses conducted in the original trial by investigating the association of premedication with biological laboratory results. 
Such workflows will be encouraged for future publications from National Heart, Lung, and Blood Institute-funded studies.}, url = {https://doi.org/10.1093/jamiaopen/ooac001}, author = {Serret-Larmande, Arnaud and Jonathan R Kaltman and Avillach, Paul} } @article {1597919, title = {Medication Use in the Management of Comorbidities Among Individuals With Autism Spectrum Disorder From a Large Nationwide Insurance Database.}, journal = {JAMA Pediatrics}, year = {2021}, abstract = { Abstract Importance: Although there is no pharmacological treatment for autism spectrum disorder (ASD) itself, behavioral and pharmacological therapies have been used to address its symptoms and common comorbidities. A better understanding of the medications used to manage comorbid conditions in this growing population is critical; however, most previous efforts have been limited in size, duration, and lack of broad representation. Objective: To use a nationally representative database to uncover trends in the prevalence of co-occurring conditions and medication use in the management of symptoms and comorbidities over time among US individuals with ASD. Design, setting, and participants: This retrospective, population-based cohort study mined a nationwide, managed health plan claims database containing more than 86 million unique members. Data from January 1, 2014, to December 31, 2019, were used to analyze prescription frequency and diagnoses of comorbidities. A total of 26 722 individuals with ASD who had been prescribed at least 1 of 24 medications most commonly prescribed to treat ASD symptoms or comorbidities during the 6-year study period were included in the analysis. Exposures: Diagnosis codes for ASD based on International Classification of Diseases, Ninth Revision, and International Statistical Classification of Diseases and Related Health Problems, Tenth Revision. 
Main outcomes and measures: Quantitative estimates of prescription frequency for the 24 most commonly prescribed medications among the study cohort and the most common comorbidities associated with each medication in this population. Results: Among the 26 722 individuals with ASD included in the analysis (77.7\% male; mean [SD] age, 14.45 [9.40] years), polypharmacy was common, ranging from 28.6\% to 31.5\%. Individuals{\textquoteright} prescription regimens changed frequently within medication classes, rather than between classes. The prescription frequency of a specific medication varied considerably, depending on the coexisting diagnosis of a given comorbidity. Of the 24 medications assessed, 15 were associated with at least a 15\% prevalence of a mood disorder, and 11 were associated with at least a 15\% prevalence of attention-deficit/hyperactivity disorder. For patients taking antipsychotics, the 2 most common comorbidities were combined type attention-deficit/hyperactivity disorder (11.6\%-17.8\%) and anxiety disorder (13.1\%-30.1\%). Conclusions and relevance: This study demonstrated considerable variability and transiency in the use of prescription medications by US clinicians to manage symptoms and comorbidities associated with ASD. These findings support the importance of early and ongoing surveillance of patients with ASD and co-occurring conditions and offer clinicians insight on the targeted therapies most commonly used to manage co-occurring conditions. Future research and policy efforts are critical to assess the extent to which pharmacological management of comorbidities affects quality of life and functioning in patients with ASD while continuing to optimize clinical guidelines, to ensure effective care for this growing population. 
}, url = {https://pubmed.ncbi.nlm.nih.gov/34097007/}, author = {Feroe, AG and Uppal, N and Guti{\'e}rrez-Sacrist{\'a}n, A and Mousavi, S and Greenspun, P and Surati, R and Kohane, IS and Avillach, P} } @article {1597914, title = {International Analysis of Electronic Health Records of Children and Youth Hospitalized With COVID-19 Infection in 6 Countries}, journal = {JAMA Network Open}, year = {2021}, abstract = { Abstract Importance:\ Additional sources of pediatric epidemiological and clinical data are needed to efficiently study COVID-19 in children and youth and inform infection prevention and clinical treatment of pediatric patients. Objective:\ To describe international hospitalization trends and key epidemiological and clinical features of children and youth with COVID-19. Design, setting, and participants:\ This retrospective cohort study included pediatric patients hospitalized between February 2 and October 10, 2020. Patient-level electronic health record (EHR) data were collected across 27 hospitals in France, Germany, Spain, Singapore, the UK, and the US. Patients younger than 21 years who tested positive for COVID-19 and were hospitalized at an institution participating in the Consortium for Clinical Characterization of COVID-19 by EHR were included in the study. Main outcomes and measures:\ Patient characteristics, clinical features, and medication use. Results:\ There were 347 males (52\%; 95\% CI, 48.5-55.3) and 324 females (48\%; 95\% CI, 44.4-51.3) in this study{\textquoteright}s cohort. There was a bimodal age distribution, with the greatest proportion of patients in the 0- to 2-year (199 patients [30\%]) and 12- to 17-year (170 patients [25\%]) age range. Trends in hospitalizations for 671 children and youth found discrete surges with variable timing across 6 countries. 
Data from this cohort mirrored national-level pediatric hospitalization trends for most countries with available data, with peaks in hospitalizations during the initial spring surge occurring within 23 days in the national-level and 4CE data. A total of 27 364 laboratory values for 16 laboratory tests were analyzed, with mean values indicating elevations in markers of inflammation (C-reactive protein, 83 mg/L; 95\% CI, 53-112 mg/L; ferritin, 417 ng/mL; 95\% CI, 228-607 ng/mL; and procalcitonin, 1.45 ng/mL; 95\% CI, 0.13-2.77 ng/mL). Abnormalities in coagulation were also evident (D-dimer, 0.78 ug/mL; 95\% CI, 0.35-1.21 ug/mL; and fibrinogen, 477 mg/dL; 95\% CI, 385-569 mg/dL). Cardiac troponin, when checked (n = 59), was elevated (0.032 ng/mL; 95\% CI, 0.000-0.080 ng/mL). Common complications included cardiac arrhythmias (15.0\%; 95\% CI, 8.1\%-21.7\%), viral pneumonia (13.3\%; 95\% CI, 6.5\%-20.1\%), and respiratory failure (10.5\%; 95\% CI, 5.8\%-15.3\%). Few children were treated with COVID-19-directed medications. Conclusions and relevance:\ This study of EHRs of children and youth hospitalized for COVID-19 in 6 countries demonstrated variability in hospitalization trends across countries and identified common complications and laboratory abnormalities in children and youth with COVID-19 infection. Large-scale informatics-based approaches to integrate and analyze data across health care systems complement methods of disease surveillance and advance understanding of epidemiological and clinical features associated with COVID-19 in children and youth. \  }, url = {https://pubmed.ncbi.nlm.nih.gov/34115127/}, author = {Bourgeois, FT and Guti{\'e}rrez-Sacrist{\'a}n, A and Keller, MS and Liu, M and Hong, C and Bonzel, CL and Tan, ALM and Aronow, BJ and Boeker, M and Booth, J and Cruz Rojo, J and Devkota, B and Garc{\'\i}a Barrio, N and Geva, A and Hanauer, DA and Hutch, MR and Issitt, RW and Klann, JG and Luo, Y. 
and Mandl, KD and Mao, C and Moal, B and Moshal, KL and Murphy, SN and Neuraz, A and Ngiam, KY and Omenn, GS and Patel, LP and Jim{\'e}nez, MP and Sebire, NJ and Balazote, PS and Serret-Larmande, A and South, AM and Spiridou, A and Taylor, D. M. and Tippmann, P and Visweswaran, S and Weber, GM and Kohane, IS and Cai, T. and Avillach, P} } @article {1527677, title = {De novo mutations across 1,465 diverse genomes reveal mutational insights and reductions in the {Amish} founder populations}, journal = {Proc Natl Acad Sci U S A}, volume = {117}, number = {5}, year = {2020}, pages = {2560--2569}, abstract = { De novo mutations (DNMs), or mutations that appear in an individual despite not being seen in their parents, are an important source of genetic variation whose impact is relevant to studies of human evolution, genetics, and disease. Utilizing high-coverage whole-genome sequencing data as part of the Trans-Omics for Precision Medicine (TOPMed) Program, we called 93,325 single-nucleotide DNMs across 1,465 trios from an array of diverse human populations, and used them to directly estimate and analyze DNM counts, rates, and spectra. We find a significant positive correlation between local recombination rate and local DNM rate, and that DNM rate explains a substantial portion (8.98 to 34.92\%, depending on the model) of the genome-wide variation in population-level genetic variation from 41K unrelated TOPMed samples. Genome-wide heterozygosity does correlate with DNM rate, but only explains {\textless}1\% of variation. While we are underpowered to see small differences, we do not find significant differences in DNM rate between individuals of European, African, and Latino ancestry, nor across ancestrally distinct segments within admixed individuals. However, we did find significantly fewer DNMs in Amish individuals, even when compared with other Europeans, and even after accounting for parental age and sequencing center. 
Specifically, we found significant reductions in the number of C{\textrightarrow}A and T{\textrightarrow}C mutations in the Amish, which seem to underpin their overall reduction in DNMs. Finally, we calculated near-zero estimates of narrow sense heritability ($h^2$), which suggest that variation in DNM rate is significantly shaped by nonadditive genetic effects and the environment. Keywords:\ Amish; de novo mutations; diversity; mutation rate; recombination. }, url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7007577/}, author = {Kessler, Michael D and Loesch, Douglas P and Perry, James A and Heard-Costa, Nancy L and Taliun, Daniel and Cade, Brian E and Wang, Heming and Daya, Michelle and Ziniti, John and Datta, Soma and Celedon, Juan C and Soto-Quiros, Manuel E and Avila, Lydiana and Weiss, Scott T and Barnes, Kathleen and Redline, Susan S and Vasan, Ramachandran S and Johnson, Andrew D and Mathias, Rasika A and Hernandez, Ryan and Wilson, James G and Nickerson, Deborah A and Abecasis, Gon{\c{c}}alo and Browning, Sharon R and Zollner, Sebastian and O{\textquoteright}Connell, Jeffrey R and Mitchell, Braxton D and {NHLBI Trans-Omics for Precision Medicine Consortium} and {TOPMed Population Genetics Working Group} and O{\textquoteright}Connor, Timothy D} } @article {1527674, title = {Treatment pathway analysis of newly diagnosed dementia patients in four electronic health record databases in {Europe}}, journal = {Soc Psychiatry Psychiatr Epidemiol}, year = {2020}, abstract = { Purpose:\ Real-world studies to describe the use of first, second and third line therapies for the management and symptomatic treatment of dementia are lacking. This retrospective cohort study describes the first-, second- and third-line therapies used for the management and symptomatic treatment of dementia, and in particular Alzheimer{\textquoteright}s Disease. 
Methods:\ Medical records of patients with newly diagnosed dementia between 1997 and 2017 were collected using four databases from the UK, Denmark, Italy and the Netherlands. Results:\ We identified 191,933 newly diagnosed dementia patients in the four databases between 1997 and 2017 with 39,836 (IPCI (NL): 3281, HSD (IT): 1601, AUH (DK): 4474, THIN (UK): 30,480) fulfilling the inclusion criteria, and of these, 21,131 had received a specific diagnosis of Alzheimer{\textquoteright}s disease. The most common first line therapy initiated within a year ({\textpm} 365 days) of diagnosis were Acetylcholinesterase inhibitors, namely rivastigmine in IPCI, donepezil in HSD and the THIN and the N-methyl-D-aspartate blocker memantine in AUH. Conclusion:\ We provide a real-world insight into the heterogeneous management and treatment pathways of newly diagnosed dementia patients and a subset of Alzheimer{\textquoteright}s Disease patients from across Europe. Keywords:\ Alzheimer{\textquoteright}s disease; Dementia; Epidemiology; Real-world data. 
}, doi = {10.1007/s00127-020-01872-2}, url = {https://link.springer.com/article/10.1007\%2Fs00127-020-01872-2}, author = {James, Glen and Collin, Estelle and Lawrance, Marcus and Mueller, Achim and Podhorna, Jana and Zaremba-Pechmann, Liliana and Rijnbeek, Peter and van der Lei, Johan and Avillach, Paul and Pedersen, Lars and Ansell, David and Pasqua, Alessandro and Mosseveld, Mees and Grosdidier, Sol{\`e}ne and Gungabissoon, Usha and Egger, Peter and Stewart, Robert and Celis-Morales, Carlos and Alexander, Myriam and Novak, Gerald and Gordon, Mark Forest} } @article {1527672, title = {A Semi-Automated Approach for Multilingual Terminology Matching: Mapping the {French} Version of the {ICD-10} to the {ICD-10 CM}}, journal = {Stud Health Technol Inform}, year = {2020}, abstract = { The aim of this study was to develop a simple method to map the French International Statistical Classification of Diseases and Related Health Problems, 10th revision (ICD-10) with the International Classification of Diseases, 10th Revision, Clinical Modification (ICD-10 CM). We sought to map these terminologies forward (ICD-10 to ICD-10 CM) and backward (ICD-10 CM to ICD-10) and to assess the accuracy of these two mappings. We used several terminology resources such as the Unified Medical Language System (UMLS) Metathesaurus, Bioportal, the latest version available of the French ICD-10 and several official mapping files between different versions of the ICD-10. We first retrieved existing partial mapping between the ICD-10 and the ICD-10 CM. Then, we automatically matched the ICD-10 with the ICD-10-CM, using our different reference mapping files. Finally, we used manual review and natural language processing (NLP) to match labels between the two terminologies. We assessed the accuracy of both methods with a manual review of a random dataset from the results files. The overall matching was between 94.2 and 100\%. The backward mapping was better than the forward one, especially regarding exact matches. 
In both cases, the NLP step was highly accurate. When there are no available experts from the ontology or NLP fields for multi-lingual ontology matching, this simple approach enables secondary reuse of Electronic Health Records (EHR) and billing data for research purposes in an international context. Keywords:\ Clinical terminologies; ICD-10; Interoperability; Multilingual matching. }, url = {https://pubmed.ncbi.nlm.nih.gov/32570338/}, author = {Sylvestre, Emmanuelle and Bouzille, Guillaume and McDuffie, Michael and Chazard, Emmanuel and Avillach, Paul and Cuggia, Marc} } @article {1527669, title = {Development and validation of a Paediatric Early Warning Score for use in the emergency department: a multicentre study.}, journal = {Lancet Child Adolesc Health}, year = {2020}, abstract = { Background:\ Paediatric Early Warning Scores (PEWSs) are being used increasingly in hospital wards to identify children at risk of clinical deterioration, but few scores exist that were designed for use in emergency care settings. To improve the prioritisation of children in the emergency department (ED), we developed and validated an ED-PEWS. Methods:\ The TrIAGE project is a prospective European observational study based on electronic health record data collected between Jan 1, 2012, and Nov 1, 2015, from five diverse EDs in four European countries (Netherlands, the UK, Austria, and Portugal). This study included data from all consecutive ED visits of children under age 16 years. The main outcome measure was a three-category reference standard (high, intermediate, low urgency) that was developed as part of the TrIAGE project as a proxy for true patient urgency. The ED-PEWS was developed based on an ordinal logistic regression model, with cross-validation by setting. After completing the study, we fully externally validated the ED-PEWS in an independent cohort of febrile children from a different ED (Greece). 
Findings:\ Of 119 209 children, 2007 (1{\textperiodcentered}7\%) were of high urgency and 29 127 (24{\textperiodcentered}4\%) of intermediate urgency, according to our reference standard. We developed an ED-PEWS consisting of age and the predictors heart rate, respiratory rate, oxygen saturation, consciousness, capillary refill time, and work of breathing. The ED-PEWS showed a cross-validated c-statistic of 0{\textperiodcentered}86 (95\% prediction interval 0{\textperiodcentered}82-0{\textperiodcentered}90) for high-urgency patients and 0{\textperiodcentered}67 (0{\textperiodcentered}61-0{\textperiodcentered}73) for high-urgency or intermediate-urgency patients. A cutoff of score of at least 15 was useful for identifying high-urgency patients with a specificity of 0{\textperiodcentered}90 (95\% CI 0{\textperiodcentered}87-0{\textperiodcentered}92) while a cutoff score of less than 6 was useful for identifying low-urgency patients with a sensitivity of 0{\textperiodcentered}83 (0{\textperiodcentered}81-0{\textperiodcentered}85). Interpretation:\ The proposed ED-PEWS can assist in identifying high-urgency and low-urgency patients in the ED, and improves prioritisation compared with existing PEWSs. Funding:\ Stichting de Drie Lichten, Stichting Sophia Kinderziekenhuis Fonds, and the European Union{\textquoteright}s Horizon 2020 research and innovation programme. 
}, url = {https://www.sciencedirect.com/science/article/abs/pii/S2352464220301395?via\%3Dihub}, author = {Zachariasse, Joany M and Nieboer, Daan and Maconochie, Ian K and Smit, Frank J and Alves, Claudio F and Greber-Platzer, Susanne and Tsolia, Maria N and Steyerberg, Ewout W and Avillach, Paul and van der Lei, Johan and Moll, Henriette A} } @article {1527664, title = {Scalability and cost-effectiveness analysis of whole genome-wide association studies on Google Cloud Platform and Amazon Web Services.}, journal = {JAMIA}, year = {2020}, abstract = { Abstract Objective:\ Advancements in human genomics have generated a surge of available data, fueling the growth and accessibility of databases for more comprehensive, in-depth genetic studies. Methods:\ We provide a straightforward and innovative methodology to optimize cloud configuration in order to conduct genome-wide association studies. We utilized Spark clusters on both Google Cloud Platform and Amazon Web Services, as well as Hail (http://doi.org/10.5281/zenodo.2646680) for analysis and exploration of genomic variants dataset. Results:\ Comparative evaluation of numerous cloud-based cluster configurations demonstrate a successful and unprecedented compromise between speed and cost for performing genome-wide association studies on 4 distinct whole-genome sequencing datasets. Results are consistent across the 2 cloud providers and could be highly useful for accelerating research in genetics. Conclusions:\ We present a timely piece for one of the most frequently asked questions when moving to the cloud: what is the trade-off between speed and cost? Keywords:\ cloud computing; distributed systems; genome-wide association study; whole genome. 
}, url = {https://academic.oup.com/jamia/advance-article/doi/10.1093/jamia/ocaa068/5876972}, author = {Krissaane, Ines and De Niz, Carlos and Guti{\'e}rrez-Sacrist{\'a}n, Alba and Korodi, Gabor and Ede, Nneka and Kumar, Ranjay and Lyons, Jessica and Manrai, Arjun and Chirag Patel and Kohane, Isaac and Avillach, Paul} } @article {1527662, title = {EHRtemporalVariability: delineating temporal data-set shifts in electronic health records.}, journal = {Gigascience}, year = {2020}, abstract = { Abstract Background:\ Temporal variability in health-care processes or protocols is intrinsic to medicine. Such variability can potentially introduce dataset shifts, a data quality issue when reusing electronic health records (EHRs) for secondary purposes. Temporal data-set shifts can present as trends, as well as abrupt or seasonal changes in the statistical distributions of data over time. The latter are particularly complicated to address in multimodal and highly coded data. These changes, if not delineated, can harm population and data-driven research, such as machine learning. Given that biomedical research repositories are increasingly being populated with large sets of historical data from EHRs, there is a need for specific software methods to help delineate temporal data-set shifts to ensure reliable data reuse. Results:\ EHRtemporalVariability is an open-source R package and Shiny app designed to explore and identify temporal data-set shifts. EHRtemporalVariability estimates the statistical distributions of coded and numerical data over time; projects their temporal evolution through non-parametric information geometric temporal plots; and enables the exploration of changes in variables through data temporal heat maps. We demonstrate the capability of EHRtemporalVariability to delineate data-set shifts in three impact case studies, one of which is available for reproducibility. 
Conclusions:\ EHRtemporalVariability enables the exploration and identification of data-set shifts, contributing to the broad examination and repurposing of large, longitudinal data sets. Our goal is to help ensure reliable data reuse for a wide range of biomedical data users. EHRtemporalVariability is designed for technical users who are programmatically utilizing the R package, as well as users who are not familiar with programming via the Shiny user interface. Availability: https://github.com/hms-dbmi/EHRtemporalVariability/ Reproducible vignette: https://cran.r-project.org/web/packages/EHRtemporalVariability/vignettes/EHRtemporalVariability.html Online demo: http://ehrtemporalvariability.upv.es/. Keywords:\ R package; claims data; data quality; data-set shifts; electronic health records; information geometry; research repositories; scientific data sets; temporal variability; visual analytics. }, url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7391413/}, author = {Saez, Carlos and Guti{\'e}rrez-Sacrist{\'a}n, Alba and Kohane, Isaac and Garcia-Gomez, Juan M and Avillach, Paul} } @article {1527660, title = {Experiences implementing scalable, containerized, cloud-based {NLP} for extracting biobank participant phenotypes at scale.}, journal = {JAMIA Open}, volume = {3}, number = {2}, year = {2020}, pages = {185--189}, abstract = { Abstract Objective:\ To develop scalable natural language processing (NLP) infrastructure for processing the free text in electronic health records (EHRs). Materials and methods:\ We extend the open-source Apache cTAKES NLP software with several standard technologies for scalability. We remove processing bottlenecks by monitoring component queue size. We process EHR free text for patients in the PrecisionLink Biobank at Boston Children{\textquoteright}s Hospital. The extracted concepts are made searchable via a web-based portal. Results:\ We processed over 1.2 million notes for over 8000 patients, extracting 154 million concepts. 
Our largest tested configuration processes over 1 million notes per day. Discussion:\ The unique information represented by extracted NLP concepts has great potential to provide a more complete picture of patient status. Conclusion:\ NLP large EHR document collections can be done efficiently, in service of high throughput phenotyping. }, url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7382623/}, author = {Miller, Timothy A and Avillach, Paul and Mandl, Kenneth D} } @article {1527659, title = {A multidimensional precision medicine approach identifies an autism subtype characterized by dyslipidemia.}, journal = {Nat Med}, year = {2020}, abstract = { Abstract The promise of precision medicine lies in data diversity. More than the sheer size of biomedical data, it is the layering of multiple data modalities, offering complementary perspectives, that is thought to enable the identification of patient subgroups with shared pathophysiology. In the present study, we use autism to test this notion. By combining healthcare claims, electronic health records, familial whole-exome sequences and neurodevelopmental gene expression patterns, we identified a subgroup of patients with dyslipidemia-associated autism. }, url = {https://pubmed.ncbi.nlm.nih.gov/32778826/}, author = {Luo, Yuan and Eran, Alal and Palmer, Nathan and Avillach, Paul and Levy-Moonshine, Ami and Szolovits, Peter and Kohane, Isaac} } @article {1499650, title = {Methotrexate and relative risk of dementia amongst patients with rheumatoid arthritis: a multi-national multi-database case-control study}, journal = {Alzheimers Res Ther}, volume = {12}, number = {1}, year = {2020}, month = apr, day = {6}, pages = {38}, abstract = {BACKGROUND: Inflammatory processes have been shown to play a role in dementia. To understand this role, we selected two anti-inflammatory drugs (methotrexate and sulfasalazine) to study their association with dementia risk. 
METHODS: A retrospective matched case-control study of patients over 50 with rheumatoid arthritis (486 dementia cases and 641 controls) who were identified from electronic health records in the UK, Spain, Denmark and the Netherlands. Conditional logistic regression models were fitted to estimate the risk of dementia. RESULTS: Prior methotrexate use was associated with a lower risk of dementia (OR 0.71, 95\% CI 0.52-0.98). Furthermore, methotrexate use with therapy longer than 4 years had the lowest risk of dementia (odds ratio 0.37, 95\% CI 0.17-0.79). Sulfasalazine use was not associated with dementia (odds ratio 0.88, 95\% CI 0.57-1.37). CONCLUSIONS: Further studies are still required to clarify the relationship between prior methotrexate use and duration as well as biological treatments with dementia risk.}, issn = {1758-9193}, doi = {10.1186/s13195-020-00606-5}, author = {Newby, Danielle and Prieto-Alhambra, Daniel and Duarte-Salles, Talita and Ansell, David and Pedersen, Lars and van der Lei, Johan and Mosseveld, Mees and Rijnbeek, Peter and James, Glen and Alexander, Myriam and Egger, Peter and Podhorna, Jana and Stewart, Robert and Perera, Gayan and Avillach, Paul and Grosdidier, Sol{\`e}ne and Lovestone, Simon and Nevado-Holgado, Alejo J} } @article {1499275, title = {{GenoPheno}: cataloging large-scale phenotypic and next-generation sequencing data within human datasets}, journal = {Brief Bioinform}, year = {2020}, month = apr, day = {6}, abstract = {Precision medicine promises to revolutionize treatment, shifting therapeutic approaches from the classical one-size-fits-all to those more tailored to the patient{\textquoteright}s individual genomic profile, lifestyle and environmental exposures. Yet, to advance precision medicine{\textquoteright}s main objective-ensuring the optimum diagnosis, treatment and prognosis for each individual-investigators need access to large-scale clinical and genomic data repositories. 
Despite the vast proliferation of these datasets, locating and obtaining access to many remains a challenge. We sought to provide an overview of available patient-level datasets that contain both genotypic data, obtained by next-generation sequencing, and phenotypic data-and to create a dynamic, online catalog for consultation, contribution and revision by the research community. Datasets included in this review conform to six specific inclusion parameters that are: (i) contain data from more than 500 human subjects; (ii) contain both genotypic and phenotypic data from the same subjects; (iii) include whole genome sequencing or whole exome sequencing data; (iv) include at least 100 recorded phenotypic variables per subject; (v) accessible through a website or collaboration with investigators and (vi) make access information available in English. Using these criteria, we identified 30 datasets, reviewed them and provided results in the release version of a catalog, which is publicly available through a dynamic Web application and on GitHub. 
Users can review as well as contribute new datasets for inclusion (Web: https://avillachlab.shinyapps.io/genophenocatalog/; GitHub: https://github.com/hms-dbmi/GenoPheno-CatalogShiny).}, issn = {1477-4054}, doi = {10.1093/bib/bbaa033}, author = {Guti{\'e}rrez-Sacrist{\'a}n, Alba and De Niz, Carlos and Kothari, Cartik and Kong, Sek Won and Mandl, Kenneth D and Avillach, Paul} } @article {1489916, title = {Correction: The {Genomics Research and Innovation Network}: creating an interoperable, federated, genomics learning system}, journal = {Genet Med}, volume = {22}, number = {2}, year = {2020}, month = feb, pages = {449}, abstract = {An amendment to this paper has been published and can be accessed via a link at the top of the paper.}, issn = {1530-0366}, doi = {10.1038/s41436-019-0711-y}, author = {Mandl, Kenneth D and Glauser, Tracy and Krantz, Ian D and Avillach, Paul and Bartels, Anna and Beggs, Alan H and Biswas, Sawona and Bourgeois, Florence T and Corsmo, Jeremy and Dauber, Andrew and Devkota, Batsal and Fleisher, Gary R and Heath, Allison P and Helbig, Ingo and Hirschhorn, Joel N and Kilbourn, Judson and Kong, Sek Won and Kornetsky, Susan and Majzoub, Joseph A and Marsolo, Keith and Martin, Lisa J and Nix, Jeremy and Schwarzhoff, Amy and Stedman, Jason and Strauss, Arnold and Sund, Kristen L and Taylor, Deanne M and White, Peter S and Marsh, Eric and Grimberg, Adda and Hawkes, Colin} } @article {1489919, title = {dbgap2x: an {R} package to explore and extract data from the database of {Genotypes} and {Phenotypes} ({dbGaP})}, journal = {Bioinformatics}, volume = {36}, number = {4}, year = {2020}, month = feb, day = {15}, pages = {1305--1306}, abstract = {SUMMARY: Based on the Genomic Data Sharing Policy issued in August 2007, the National Institutes of Health (NIH) has supported several repositories such as the database of Genotypes and Phenotypes (dbGaP). 
dbGaP is an online repository that provides access to large-scale genetic and phenotypic datasets with more than 1000 studies. However, navigating the website and understanding the relationship between the studies are not easy tasks. Moreover, the decryption of the files is a complex procedure. In this study we propose the dbgap2x R package that covers a broad range of functions for searching dbGaP studies, exploring the characteristics of a study and easily decrypting the files from dbGaP. AVAILABILITY AND IMPLEMENTATION: dbgap2x is an R package with the code available at https://github.com/gversmee/dbgap2x. A containerized version including the package, a Jupyter server and with a Notebook example is available at https://hub.docker.com/r/gversmee/dbgap2x. SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.}, issn = {1367-4811}, doi = {10.1093/bioinformatics/btz680}, author = {Versm{\'e}e, Gr{\'e}goire and Versm{\'e}e, Laura and Dusenne, Mika{\"e}l and Jalali, Niloofar and Avillach, Paul} } @article {1489920, title = {The {Genomics Research and Innovation Network}: creating an interoperable, federated, genomics learning system}, journal = {Genet Med}, volume = {22}, number = {2}, year = {2020}, month = feb, pages = {371--380}, abstract = {PURPOSE: Clinicians and researchers must contextualize a patient{\textquoteright}s genetic variants against population-based references with detailed phenotyping. We sought to establish globally scalable technology, policy, and procedures for sharing biosamples and associated genomic and phenotypic data on broadly consented cohorts, across sites of care. METHODS: Three of the nation{\textquoteright}s leading children{\textquoteright}s hospitals launched the Genomic Research and Innovation Network (GRIN), with federated information technology infrastructure, harmonized biobanking protocols, and material transfer agreements. 
Pilot studies in epilepsy and short stature were completed to design and test the collaboration model. RESULTS: Harmonized, broadly consented institutional review board (IRB) protocols were approved and used for biobank enrollment, creating ever-expanding, compatible biobanks. An open source federated query infrastructure was established over genotype-phenotype databases at the three hospitals. Investigators securely access the GRIN platform for prep to research queries, receiving aggregate counts of patients with particular phenotypes or genotypes in each biobank. With proper approvals, de-identified data is exported to a shared analytic workspace. Investigators at all sites enthusiastically collaborated on the pilot studies, resulting in multiple publications. Investigators have also begun to successfully utilize the infrastructure for grant applications. CONCLUSIONS: The GRIN collaboration establishes the technology, policy, and procedures for a scalable genomic research network.}, issn = {1530-0366}, doi = {10.1038/s41436-019-0646-3}, author = {Mandl, Kenneth D and Glauser, Tracy and Krantz, Ian D and Avillach, Paul and Bartels, Anna and Beggs, Alan H and Biswas, Sawona and Bourgeois, Florence T and Corsmo, Jeremy and Dauber, Andrew and Devkota, Batsal and Fleisher, Gary R and Heath, Allison P and Helbig, Ingo and Hirschhorn, Joel N and Kilbourn, Judson and Kong, Sek Won and Kornetsky, Susan and Majzoub, Joseph A and Marsolo, Keith and Martin, Lisa J and Nix, Jeremy and Schwarzhoff, Amy and Stedman, Jason and Strauss, Arnold and Sund, Kristen L and Taylor, Deanne M and White, Peter S and Marsh, Eric and Grimberg, Adda and Hawkes, Colin} } @article {1489922, title = {Associations of antepartum suicidal behaviour with adverse infant and obstetric outcomes}, journal = {Paediatr Perinat Epidemiol}, volume = {33}, number = {2}, year = {2019}, month = {2019 03}, pages = {137-144}, abstract = {BACKGROUND: Relatively little is known about antepartum suicidal behaviour 
and pregnancy outcomes. We examined associations of antepartum suicidal behaviour, alone and in combination with psychiatric disorders, with adverse infant and obstetric outcomes. METHODS: We included 188~925 singleton livebirths from a retrospective cohort (1996-2016). Suicidal behaviour, psychiatric disorders, and outcomes were derived from electronic medical records. We performed multivariable logistic regressions with generalised estimating equations to estimate adjusted odds ratios (aOR) with 95\% confidence intervals (95\%CI). RESULTS: The prevalence of antepartum suicidal behaviour was 152.44 per 100~000 singleton livebirths. Nearly two-thirds (64.24\%) of women with suicidal behaviour also had psychiatric disorders. Compared to women without psychiatric disorders and suicidal behaviour, women with psychiatric disorders alone had 1.3-fold to 1.4-fold increased odds of delivering low birthweight or preterm infants and 1.2-fold increased odds of experiencing obstetric complications. Women with suicidal behaviour alone had increased odds of preterm labour (aOR~2.05, 95\% CI 1.16, 3.62). Women with both suicidal behaviour and psychiatric disorders had > twofold increased odds of delivering low birthweight (aOR~2.52, 95\% CI 1.40, 4.54), preterm birth (aOR~2.44, 95\% CI 1.63, 3.66), and low birthweight/preterm birth (aOR~2.30, 95\% CI 1.54, 3.44) infants; the odds of preterm labour (aOR~1.62, 95\% CI 1.06, 2.47), placental abruption (aOR~2.33, 95\% CI 1.20, 4.51), preterm rupture of membranes (aOR~1.63, 95\% CI 1.08, 2.46), and postpartum haemorrhage (aOR~1.93, 95\%CI 1.09, 3.40) were elevated. CONCLUSIONS: Antepartum suicidal behaviour, when co-occurring with psychiatric disorders, is associated with increased odds of adverse infant and obstetric outcomes. 
Future studies are warranted to understand the causal roles of suicidal behaviour and psychiatric disorders in pregnancy.}, keywords = {Adult, Female, Humans, Infant, Newborn, Obstetric Labor Complications, Odds Ratio, Pregnancy, Pregnancy Complications, Pregnancy Outcome, Psychotic Disorders, Retrospective Studies, Suicidal ideation, Suicide, Attempted, United States, Young Adult}, issn = {1365-3016}, doi = {10.1111/ppe.12535}, author = {Zhong, Qiu-Yue and Gelaye, Bizu and Karlson, Elizabeth W and Avillach, Paul and Smoller, Jordan W and Cai, Tianxi and Williams, Michelle A} } @article {1489923, title = {Comparison of variation in frequency for SNPs associated with asthma or liver disease between Estonia, HapMap populations and the 1000 genome project populations}, journal = {Int J Immunogenet}, volume = {46}, number = {2}, year = {2019}, month = {2019 Apr}, pages = {49-58}, abstract = {Allele-specific analyses to understand frequency differences across populations, particularly populations not well studied, are important to help identify variants that may have a functional effect on disease mechanisms and phenotypic predisposition, facilitating new Genome-Wide Association Studies (GWAS). We aimed to compare the allele frequency of 11 asthma-associated and 16 liver disease-associated single nucleotide polymorphisms (SNPs) between the Estonian, HapMap and 1000 genome project populations. When comparing EGCUT with HapMap populations, the largest difference in allele frequencies was observed with the Maasai population in Kinyawa, Kenya, with 12 SNP variants reporting statistical significance. Similarly, when comparing EGCUT with 1000 genomes project populations, the largest difference in allele frequencies was observed with pooled African populations with 22 SNP variants reporting statistical significance. 
For 11 asthma-associated and 16 liver disease-associated SNPs, Estonians are genetically similar to other European populations but significantly different from African populations. Understanding differences in genetic architecture between ethnic populations is important to facilitate new GWAS targeted at underserved ethnic groups to enable novel genetic findings to aid the development of new therapies to reduce morbidity and mortality.}, keywords = {Asthma, Estonia, Gene Frequency, Genetics, Population, Genome, Human, HapMap Project, Humans, Liver Diseases, Polymorphism, Single Nucleotide}, issn = {1744-313X}, doi = {10.1111/iji.12413}, author = {Reisberg, Sulev and Galwey, Nicholas and Avillach, Paul and Sahlqvist, Anna-Stina and Kolberg, Liis and M{\"a}gi, Reedik and Esko, T{\~o}nu and Vilo, Jaak and James, Glen} } @article {1489921, title = {An exploratory phenome wide association study linking asthma and liver disease genetic variants to electronic health records from the Estonian Biobank}, journal = {PLoS One}, volume = {14}, number = {4}, year = {2019}, month = {2019}, pages = {e0215026}, abstract = {The Estonian Biobank, governed by the Institute of Genomics at the University of Tartu (Biobank), has stored genetic material/DNA and continuously collected data since 2002 on a total of 52,274 individuals representing $\sim$5\% of the Estonian adult population and is increasing. To explore the utility of data available in the Biobank, we conducted a phenome-wide association study (PheWAS) in two areas of interest to healthcare researchers; asthma and liver disease. We used 11 asthma and 13 liver disease-associated single nucleotide polymorphisms (SNPs), identified from published genome-wide association studies, to test our ability to detect established associations. We confirmed 2 asthma and 5 liver disease associated variants at nominal significance and directionally consistent with published results. 
We found 2 associations that were opposite to what was published before (rs4374383:AA increases risk of NASH/NAFLD, rs11597086 increases ALT level). Three SNP-diagnosis pairs passed the phenome-wide significance threshold: rs9273349 and E06 (thyroiditis, p = $5.50 \times 10^{-8}$); rs9273349 and E10 (type-1 diabetes, p = $2.60 \times 10^{-7}$); and rs2281135 and K76 (non-alcoholic liver diseases, including NAFLD, p = $4.10 \times 10^{-7}$). We have validated our approach and confirmed the quality of the data for these conditions. Importantly, we demonstrate that the extensive amount of genetic and medical information from the Estonian Biobank can be successfully utilized for scientific research.}, keywords = {Adult, Asthma, Biological Specimen Banks, Electronic Health Records, Estonia, Female, Genetic Predisposition to Disease, Genome-Wide Association Study, Genotype, Humans, Liver Diseases, Male, Phenomics, Phenotype, Polymorphism, Single Nucleotide}, issn = {1932-6203}, doi = {10.1371/journal.pone.0215026}, author = {James, Glen and Reisberg, Sulev and Lepik, Kaido and Galwey, Nicholas and Avillach, Paul and Kolberg, Liis and M{\"a}gi, Reedik and Esko, T{\~o}nu and Alexander, Myriam and Waterworth, Dawn and Loomis, A Katrina and Vilo, Jaak} } @article {1489917, title = {FAIRshake: Toolkit to Evaluate the FAIRness of Research Digital Resources}, journal = {Cell Syst}, volume = {9}, number = {5}, year = {2019}, month = {2019 Nov 27}, pages = {417--421}, abstract = {As more digital resources are produced by the research community, it is becoming increasingly important to harmonize and organize them for synergistic utilization. The findable, accessible, interoperable, and reusable (FAIR) guiding principles have prompted many stakeholders to consider strategies for tackling this challenge. The FAIRshake toolkit was developed to enable the establishment of community-driven FAIR metrics and rubrics paired with manual and automated FAIR assessments. 
FAIR assessments are visualized as an insignia that can be embedded within digital-resources-hosting websites. Using FAIRshake, a variety of biomedical digital resources were manually and automatically evaluated for their level of FAIRness.}, issn = {2405-4720}, doi = {10.1016/j.cels.2019.09.011}, author = {Clarke, Daniel J B and Wang, Lily and Jones, Alex and Wojciechowicz, Megan L and Torre, Denis and Jagodnik, Kathleen M and Jenkins, Sherry L and McQuilton, Peter and Flamholz, Zachary and Silverstein, Moshe C and Schilder, Brian M and Robasky, Kimberly and Castillo, Claris and Idaszak, Ray and Ahalt, Stanley C and Williams, Jason and Schurer, Stephan and Cooper, Daniel J and de Miranda Azevedo, Ricardo and Klenk, Juergen A and Haendel, Melissa A and Nedzel, Jared and Avillach, Paul and Shimoyama, Mary E and Harris, Rayna M and Gamble, Meredith and Poten, Rudy and Charbonneau, Amanda L and Larkin, Jennie and Brown, C Titus and Bonazzi, Vivien R and Dumontier, Michel J and Sansone, Susanna-Assunta and Ma{\textquoteright}ayan, Avi} } @article {1489918, title = {A framework for the investigation of rare genetic disorders in neuropsychiatry}, journal = {Nat Med}, volume = {25}, number = {10}, year = {2019}, month = {2019 10}, pages = {1477--1487}, abstract = {De novo and inherited rare genetic disorders (RGDs) are a major cause of human morbidity, frequently involving neuropsychiatric symptoms. Recent advances in genomic technologies and data sharing have revolutionized the identification and diagnosis of RGDs, presenting an opportunity to elucidate the mechanisms underlying neuropsychiatric disorders by investigating the pathophysiology of high-penetrance genetic risk factors. Here we seek out the best path forward for achieving these goals. 
We think future research will require consistent approaches across multiple RGDs and developmental stages, involving both the characterization of shared neuropsychiatric dimensions in humans and the identification of neurobiological commonalities in model systems. A coordinated and concerted effort across patients, families, researchers, clinicians and institutions, including rapid and broad sharing of data, is now needed to translate these discoveries into urgently needed therapies.}, keywords = {Genomics, Humans, Mental Disorders, Neuropsychiatry, Rare Diseases}, issn = {1546-170X}, doi = {10.1038/s41591-019-0581-5}, author = {Sanders, Stephan J and Sahin, Mustafa and Hostyk, Joseph and Thurm, Audrey and Jacquemont, Sebastien and Avillach, Paul and Douard, Elise and Martin, Christa L and Modi, Meera E and Moreno-De-Luca, Andres and Raznahan, Armin and Anticevic, Alan and Dolmetsch, Ricardo and Feng, Guoping and Geschwind, Daniel H and Glahn, David C and Goldstein, David B and Ledbetter, David H and Mulle, Jennifer G and Pasca, Sergiu P and Samaco, Rodney and Sebat, Jonathan and Pariser, Anne and Lehner, Thomas and Gur, Raquel E and Bearden, Carrie E} } @article {1489913, title = {Non-alcoholic fatty liver disease and risk of incident acute myocardial infarction and stroke: findings from matched cohort study of 18 million European adults}, journal = {BMJ}, volume = {367}, year = {2019}, month = {2019 10 08}, pages = {l5367}, abstract = {OBJECTIVE: To estimate the risk of acute myocardial infarction (AMI) or stroke in adults with non-alcoholic fatty liver disease (NAFLD) or non-alcoholic steatohepatitis (NASH). DESIGN: Matched cohort study. SETTING: Population based, electronic primary healthcare databases before 31 December 2015 from four European countries: Italy (n=1 542 672), Netherlands (n=2 225 925), Spain (n=5 488 397), and UK (n=12 695 046). 
PARTICIPANTS: 120 795 adults with a recorded diagnosis of NAFLD or NASH and no other liver diseases, matched at time of NAFLD diagnosis (index date) by age, sex, practice site, and visit, recorded at six months before or after the date of diagnosis, with up to 100 patients without NAFLD or NASH in the same database. MAIN OUTCOME MEASURES: Primary outcome was incident fatal or non-fatal AMI and ischaemic or unspecified stroke. Hazard ratios were estimated using Cox models and pooled across databases by random effect meta-analyses. RESULTS: 120 795 patients with recorded NAFLD or NASH diagnoses were identified with mean follow-up 2.1-5.5 years. After adjustment for age and smoking the pooled hazard ratio for AMI was 1.17 (95\% confidence interval 1.05 to 1.30; 1035 events in participants with NAFLD or NASH, 67 823 in matched controls). In a group with more complete data on risk factors (86 098 NAFLD and 4 664 988 matched controls), the hazard ratio for AMI after adjustment for systolic blood pressure, type 2 diabetes, total cholesterol level, statin use, and hypertension was 1.01 (0.91 to 1.12; 747 events in participants with NAFLD or NASH, 37 462 in matched controls). After adjustment for age and smoking status the pooled hazard ratio for stroke was 1.18 (1.11 to 1.24; 2187 events in participants with NAFLD or NASH, 134 001 in matched controls). In the group with more complete data on risk factors, the hazard ratio for stroke was 1.04 (0.99 to 1.09; 1666 events in participants with NAFLD, 83 882 in matched controls) after further adjustment for type 2 diabetes, systolic blood pressure, total cholesterol level, statin use, and hypertension. CONCLUSIONS: The diagnosis of NAFLD in current routine care of 17.7 million patient appears not to be associated with AMI or stroke risk after adjustment for established cardiovascular risk factors. 
Cardiovascular risk assessment in adults with a diagnosis of NAFLD is important but should be done in the same way as for the general population.}, keywords = {Adult, Aged, Cohort Studies, Databases, Factual, Female, Follow-Up Studies, Humans, Hypertension, Incidence, Italy, Liver, Male, Middle Aged, Myocardial Infarction, Netherlands, Non-alcoholic Fatty Liver Disease, Proportional Hazards Models, Risk Assessment, Risk Factors, Smoking, Spain, Stroke}, issn = {1756-1833}, doi = {10.1136/bmj.l5367}, author = {Alexander, Myriam and Loomis, A Katrina and van der Lei, Johan and Duarte-Salles, Talita and Prieto-Alhambra, Daniel and Ansell, David and Pasqua, Alessandro and Lapi, Francesco and Rijnbeek, Peter and Mosseveld, Mees and Avillach, Paul and Egger, Peter and Dhalwani, Nafeesa N and Kendrick, Stuart and Celis-Morales, Carlos and Waterworth, Dawn M and Alazawi, William and Sattar, Naveed} } @article {1489914, title = {Use of natural language processing in electronic medical records to identify pregnant women with suicidal behavior: towards a solution to the complex classification problem}, journal = {Eur J Epidemiol}, volume = {34}, number = {2}, year = {2019}, month = {2019 Feb}, pages = {153-162}, abstract = {We developed algorithms to identify pregnant women with suicidal behavior using information extracted from clinical notes by natural language processing (NLP) in electronic medical records. Using both codified data and NLP applied to unstructured clinical notes, we first screened pregnant women in Partners HealthCare for suicidal behavior. Psychiatrists manually reviewed clinical charts to identify relevant features for suicidal behavior and to obtain gold-standard labels. Using the adaptive elastic net, we developed algorithms to classify suicidal behavior. We then validated algorithms in an independent validation dataset. 
From 275,843 women with codes related to pregnancy or delivery, 9331 women screened positive for suicidal behavior by either codified data (N = 196) or NLP (N = 9,145). Using expert-curated features, our algorithm achieved an area under the curve of 0.83. By setting a positive predictive value comparable to that of diagnostic codes related to suicidal behavior (0.71), we obtained a sensitivity of 0.34, specificity of 0.96, and negative predictive value of 0.83. The algorithm identified 1423 pregnant women with suicidal behavior among 9331 women screened positive. Mining unstructured clinical notes using NLP resulted in a 11-fold increase in the number of pregnant women identified with suicidal behavior, as compared to solely reliance on diagnostic codes.}, keywords = {Algorithms, data mining, Electronic Health Records, Female, Humans, International Classification of Diseases, Natural Language Processing, Pregnancy, Pregnancy Complications, Suicidal ideation}, issn = {1573-7284}, doi = {10.1007/s10654-018-0470-0}, author = {Zhong, Qiu-Yue and Mittal, Leena P and Nathan, Margo D and Brown, Kara M and Knudson Gonz{\'a}lez, Deborah and Cai, Tianrun and Finan, Sean and Gelaye, Bizu and Avillach, Paul and Smoller, Jordan W and Karlson, Elizabeth W and Cai, Tianxi and Williams, Michelle A} } @article {1527682, title = {Rcupcake: an R package for querying and analyzing biomedical data through BD2K Pic-Sure restful API}, journal = {Bioinformatics}, year = {2018}, abstract = { Motivation:\ In the era of big data and precision medicine, the number of databases containing clinical, environmental, self-reported and biochemical variables is increasing exponentially. Enabling the experts to focus on their research questions rather than on computational data management, access and analysis is one of the most significant challenges nowadays. 
Results:\ We present Rcupcake, an R package that contains a variety of functions for leveraging different databases through the BD2K PIC-SURE RESTful API and facilitating its query, analysis and interpretation. The package offers a variety of analysis and visualization tools, including the study of the phenotype co-occurrence and prevalence, according to multiple layers of data, such as phenome, exposome or genome. Availability and implementation:\ The package is implemented in R and is available under Mozilla v2 license from GitHub (https://github.com/hms-dbmi/Rcupcake). Two reproducible case studies are also available (https://github.com/hms-dbmi/Rcupcake-case-studies/blob/master/SSCcaseStudy_v01.ipynb, https://github.com/hms-dbmi/Rcupcake-case-studies/blob/master/NHANEScaseStudy_v01.ipynb). Contact:\ paul_avillach@hms.harvard.edu. Supplementary information:\ Supplementary data are available at Bioinformatics online. }, url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5905576/}, author = {Guti{\'e}rrez-Sacrist{\'a}n, Alba and Guedj, Romain and Korodi, Gabor and Stedman, Jason and Furlong, Laura I and Patel, Chirag J and Kohane, Isaac S and Avillach, Paul} } @article {1489924, title = {Screening pregnant women for suicidal behavior in electronic medical records: diagnostic codes vs. clinical notes processed by natural language processing}, journal = {BMC Med Inform Decis Mak}, volume = {18}, number = {1}, year = {2018}, month = {2018 05 29}, pages = {30}, abstract = {BACKGROUND: We examined the comparative performance of structured, diagnostic codes vs. natural language processing (NLP) of unstructured text for screening suicidal behavior among pregnant women in electronic medical records (EMRs). METHODS: Women aged 10-64 years with at least one diagnostic code related to pregnancy or delivery (N = 275,843) from Partners HealthCare were included as our "datamart." 
Diagnostic codes related to suicidal behavior were applied to the datamart to screen women for suicidal behavior. Among women without any diagnostic codes related to suicidal behavior (n = 273,410), 5880 women were randomly sampled, of whom 1120 had at least one mention of terms related to suicidal behavior in clinical notes. NLP was then used to process clinical notes for the 1120 women. Chart reviews were performed for subsamples of women. RESULTS: Using diagnostic codes, 196 pregnant women were screened positive for suicidal behavior, among whom 149 (76\%) had confirmed suicidal behavior by chart review. Using NLP among those without diagnostic codes, 486 pregnant women were screened positive for suicidal behavior, among whom 146 (30\%) had confirmed suicidal behavior by chart review. CONCLUSIONS: The use of NLP substantially improves the sensitivity of screening suicidal behavior in EMRs. However, the prevalence of confirmed suicidal behavior was lower among women who did not have diagnostic codes for suicidal behavior but screened positive by NLP. NLP should be used together with diagnostic codes for future EMR-based phenotyping studies for suicidal behavior.}, keywords = {Adolescent, Adult, Child, Electronic Health Records, Female, Humans, Massachusetts, Middle Aged, Natural Language Processing, Pregnancy, Pregnancy Complications, Registries, Suicide, Attempted, Young Adult}, issn = {1472-6947}, doi = {10.1186/s12911-018-0617-7}, author = {Zhong, Qiu-Yue and Karlson, Elizabeth W and Gelaye, Bizu and Finan, Sean and Avillach, Paul and Smoller, Jordan W and Cai, Tianxi and Williams, Michelle A} } @article {1489915, title = {Real-world data reveal a diagnostic gap in non-alcoholic fatty liver disease}, journal = {BMC Med}, volume = {16}, number = {1}, year = {2018}, month = {2018 08 13}, pages = {130}, abstract = {BACKGROUND: Non-alcoholic fatty liver disease (NAFLD) is the most common cause of liver disease worldwide. 
It affects an estimated 20\% of the general population, based on cohort studies of varying size and heterogeneous selection. However, the prevalence and incidence of recorded NAFLD diagnoses in unselected real-world health-care records is unknown. We harmonised health records from four major European territories and assessed age- and sex-specific point prevalence and incidence of NAFLD over the past decade. METHODS: Data were extracted from The Health Improvement Network (UK), Health Search Database (Italy), Information System for Research in Primary Care (Spain) and Integrated Primary Care Information (Netherlands). Each database uses a different coding system. Prevalence and incidence estimates were pooled across databases by random-effects meta-analysis after a log-transformation. RESULTS: Data were available for 17,669,973 adults, of which 176,114 had a recorded diagnosis of NAFLD. Pooled prevalence trebled from 0.60\% in 2007 (95\% confidence interval: 0.41-0.79) to 1.85\% (0.91-2.79) in 2014. Incidence doubled from 1.32 (0.83-1.82) to 2.35 (1.29-3.40) per 1000 person-years. The FIB-4 non-invasive estimate of liver fibrosis could be calculated in 40.6\% of patients, of whom 29.6-35.7\% had indeterminate or high-risk scores. CONCLUSIONS: In the largest primary-care record study of its kind to date, rates of recorded NAFLD are much lower than expected suggesting under-diagnosis and under-recording. Despite this, we have identified rising incidence and prevalence of the diagnosis. 
Improved recognition of NAFLD may identify people who will benefit from risk factor modification or emerging therapies to prevent progression to cardiometabolic and hepatic complications.}, keywords = {Databases, Factual, Disease Progression, Female, Humans, Incidence, Male, Middle Aged, Non-alcoholic Fatty Liver Disease, Prevalence, Risk Factors}, issn = {1741-7015}, doi = {10.1186/s12916-018-1103-x}, author = {Alexander, Myriam and Loomis, A Katrina and Fairburn-Beech, Jolyon and van der Lei, Johan and Duarte-Salles, Talita and Prieto-Alhambra, Daniel and Ansell, David and Pasqua, Alessandro and Lapi, Francesco and Rijnbeek, Peter and Mosseveld, Mees and Avillach, Paul and Egger, Peter and Kendrick, Stuart and Waterworth, Dawn M and Sattar, Naveed and Alazawi, William} } @article {1310182, title = {Adverse obstetric and neonatal outcomes complicated by psychosis among pregnant women in the United States}, journal = {BMC Pregnancy Childbirth}, volume = {18}, number = {1}, year = {2018}, abstract = {Adverse obstetric and neonatal outcomes among women with psychosis, particularly affective psychosis, has rarely been studied at the population level. We aimed to assess the risk of adverse obstetric and neonatal outcomes among women with psychosis (schizophrenia, affective psychosis, and other psychoses).\ }, url = {https://bmcpregnancychildbirth.biomedcentral.com/articles/10.1186/s12884-018-1750-0}, author = {Zhong, QY and Gelaye, B and Fricchione, GL and Avillach, P and Karlson, EW and Williams, MA} } @article {1307843, title = {Dementia prevalence and incidence in a federation of European Electronic Health Record databases: The European Medical Informatics Framework resource}, journal = {Alzheimers Dement}, volume = {14}, number = {2}, year = {2018}, month = {2018 Feb}, pages = {130-139}, abstract = {INTRODUCTION: The European Medical Information Framework consortium has assembled electronic health record (EHR) databases for dementia research. 
We calculated dementia prevalence and incidence in 25 million persons from 2004 to~2012. METHODS: Six EHR databases (three primary care and three secondary care) from five countries were interrogated. Dementia was ascertained by consensus harmonization of clinical/diagnostic codes. Annual period prevalences and incidences by age and gender were calculated and meta-analyzed. RESULTS: The six databases contained 138,625 dementia cases. Age-specific prevalences were around 30\% of published estimates from community samples and incidences were around 50\%. Pooled prevalences had increased from 2004 to 2012 in all age groups but pooled incidences only after age 75~years. Associations with age and gender were stable over time. DISCUSSION: The European Medical Information Framework initiative supports EHR data on unprecedented number of people with dementia. Age-specific prevalences and incidences mirror estimates from community samples in pattern at levels that are lower but increasing over time.}, issn = {1552-5279}, doi = {10.1016/j.jalz.2017.06.2270}, author = {Perera, Gayan and Pedersen, Lars and Ansel, David and Alexander, Myriam and Arrighi, H Michael and Avillach, Paul and Foskett, Nadia and Gini, Rosa and Gordon, Mark F and Gungabissoon, Usha and Mayer, Miguel-Angel and Novak, Gerald and Rijnbeek, Peter and Trifir{\`o}, Gianluca and van der Lei, Johan and Visser, Pieter J and Stewart, Robert} } @article {1307840, title = {Adverse obstetric outcomes during delivery hospitalizations complicated by suicidal behavior among US pregnant women}, journal = {PLoS One}, volume = {13}, number = {2}, year = {2018}, month = {2018}, pages = {e0192943}, abstract = {OBJECTIVE: The effects of suicidal behavior on obstetric outcomes remain dangerously unquantified. We sought to report on the risk of adverse obstetric outcomes for US women with suicidal behavior at the time of delivery. 
METHODS: We performed a cross-sectional analysis of delivery hospitalizations from 2007-2012 National (Nationwide) Inpatient Sample. From the same hospitalization record, International Classification of Diseases codes were used to identify suicidal behavior and adverse obstetric outcomes. Adjusted odds ratios (aOR) and 95\% confidence intervals (CI) were obtained using logistic regression. RESULTS: Of the 23,507,597 delivery hospitalizations, 2,180 were complicated by suicidal behavior. Women with suicidal behavior were at a heightened risk for outcomes including antepartum hemorrhage (aOR = 2.34; 95\% CI: 1.47-3.74), placental abruption (aOR = 2.07; 95\% CI: 1.17-3.66), postpartum hemorrhage (aOR = 2.33; 95\% CI: 1.61-3.37), premature delivery (aOR = 3.08; 95\% CI: 2.43-3.90), stillbirth (aOR = 10.73; 95\% CI: 7.41-15.56), poor fetal growth (aOR = 1.70; 95\% CI: 1.10-2.62), and fetal anomalies (aOR = 3.72; 95\% CI: 2.57-5.40). No significant association was observed for maternal suicidal behavior with cesarean delivery, induction of labor, premature rupture of membranes, excessive fetal growth, and fetal distress. The mean length of stay was longer for women with suicidal behavior. 
CONCLUSION: During delivery hospitalization, women with suicidal behavior are at increased risk for many adverse obstetric outcomes, highlighting the importance of screening for and providing appropriate clinical care for women with suicidal behavior during pregnancy.}, issn = {1932-6203}, doi = {10.1371/journal.pone.0192943}, author = {Zhong, Qiu-Yue and Gelaye, Bizu and Smoller, Jordan W and Avillach, Paul and Cai, Tianxi and Williams, Michelle A} } @article {1307884, title = {Development of the Precision Link Biobank at Boston Children{\textquoteright}s Hospital: Challenges and Opportunities}, journal = {J Pers Med}, volume = {7}, number = {4}, year = {2017}, month = {2017 Dec 15}, abstract = {Increasingly, biobanks are being developed to support organized collections of biological specimens and associated clinical information on broadly consented, diverse patient populations. We describe the implementation of a pediatric biobank, comprised of a fully-informed patient cohort linking specimens to phenotypic data derived from electronic health records (EHR). The Biobank was launched after multiple stakeholders{\textquoteright} input and implemented initially in a pilot phase before hospital-wide expansion in 2016. In-person informed consent is obtained from all participants enrolling in the Biobank and provides permission to: (1) access EHR data for research; (2) collect and use residual specimens produced as by-products of routine care; and (3) share de-identified data and specimens outside of the institution. Participants are recruited throughout the hospital, across diverse clinical settings. We have enrolled 4900 patients to date, and 41\% of these have an associated blood sample for DNA processing. Current efforts are focused on aligning the Biobank with other ongoing research efforts at our institution and extending our electronic consenting system to support remote enrollment. 
A number of pediatric-specific challenges and opportunities is reviewed, including the need to re-consent patients when they reach 18 years of age, the ability to enroll family members accompanying patients and alignment with disease-specific research efforts at our institution and other pediatric centers to increase cohort sizes, particularly for rare diseases.}, issn = {2075-4426}, doi = {10.3390/jpm7040021}, author = {Bourgeois, Florence T and Avillach, Paul and Kong, Sek Won and Heinz, Michelle M and Tran, Tram A and Chakrabarty, Ramkrishna and Bickel, Jonathan and Sliz, Piotr and Borglund, Erin M and Kornetsky, Susan and Mandl, Kenneth D} } @article {1307842, title = {Rcupcake: an R package for querying and analyzing biomedical data through the BD2K PIC-SURE RESTful API}, journal = {Bioinformatics}, year = {2017}, month = {2017 Dec 18}, abstract = {Motivation: In the era of big data and precision medicine, the number of databases containing clinical, environmental, self-reported, and biochemical variables is increasing exponentially. Enabling the experts to focus on their research questions rather than on computational data management, access and analysis is one of the most significant challenges nowadays. Results: We present Rcupcake, an R package that contains a variety of functions for leveraging different databases through the BD2K PIC-SURE RESTful API and facilitating its query, analysis and interpretation. The package offers a variety of analysis and visualization tools, including the study of the phenotype co-occurrence and prevalence, according to multiple layers of data, such as phenome, exposome or genome. Availability: The package is implemented in R and is available under Mozilla v2 license from GitHub (https://github.com/hms-dbmi/Rcupcake). 
Two reproducible case studies are also available (https://github.com/hms-dbmi/Rcupcake-case-studies/blob/master/SSCcaseStudy_v01.ipynb, https://github.com/hms-dbmi/Rcupcake-case-studies/blob/master/NHANEScaseStudy_v01.ipynb). Contact: paul_avillach@hms.harvard.edu. Supplementary information: Supplementary data are available at Bioinformatics online.}, issn = {1367-4811}, doi = {10.1093/bioinformatics/btx788}, author = {Guti{\'e}rrez-Sacrist{\'a}n, Alba and Guedj, Romain and Korodi, Gabor and Stedman, Jason and Furlong, Laura I and Patel, Chirag J and Kohane, Isaac S and Avillach, Paul} } @article {1264536, title = {Health assessment of French university students and risk factors associated with mental health disorders}, journal = {PLoS One}, volume = {12}, number = {11}, year = {2017}, abstract = { The first year of university is a particularly stressful period and can impact academic performance and students{\textquoteright} health. The aim of this study was to evaluate the health and lifestyle of undergraduates and assess risk factors associated with psychiatric symptoms. Between September 2012 and June 2013, we included all undergraduate students who underwent a compulsory medical visit at the university medical service in Nice (France) during which they were screened for potential diseases during a diagnostic interview. Data were collected prospectively in the CALCIUM database (Consultations Assist{\'e}s par Logiciel pour les Centres Inter-Universitaire de M{\'e}decine) and included information about the students{\textquoteright} lifestyle (living conditions, dietary behavior, physical activity, use of recreational drugs). The prevalence of psychiatric symptoms related to depression, anxiety and panic attacks was assessed and risk factors for these symptoms were analyzed using logistic regression. A total of 4,184 undergraduates were included. Prevalence for depression, anxiety and panic attacks were 12.6\%, 7.6\% and 1.0\%, respectively. 
During the 30 days preceding the evaluation, 0.6\% of the students regularly drank alcohol, 6.3\% were frequent-to-heavy tobacco smokers, and 10.0\% smoked marijuana. Dealing with financial difficulties and having learning disabilities were associated with psychiatric symptoms. Students who were dissatisfied with their living conditions and those with poor dietary behavior were at risk of depression. Being a woman and living alone were associated with anxiety. Students who screened positively for any psychiatric disorder assessed were at a higher risk of having another psychiatric disorder concomitantly. The prevalence of psychiatric disorders in undergraduate students is low but the rate of students at risk of developing chronic disease is far from being negligible. Understanding predictors for these symptoms may improve students{\textquoteright} health by implementing targeted prevention campaigns. Further research in other French universities is necessary to confirm our results. }, author = {Tran, A and Tran, L and Geghre, N and Darmon, D and Rampal, M and Brandone, D and Gozzo, J M and Haas, H and Rebouillat-Savy, K and Caci, H and Avillach, P} } @article {1244276, title = {Combining clinical and genomics queries using i2b2 - Three methods}, journal = {PLoS One}, year = {2017}, abstract = {We are fortunate to be living in an era of twin biomedical data surges: a burgeoning representation of human phenotypes in the medical records of our healthcare systems, and high-throughput sequencing making rapid technological advances. The difficulty representing genomic data and its annotations has almost by itself led to the recognition of a biomedical {\textquotedblleft}Big Data{\textquotedblright} challenge, and the complexity of healthcare data only compounds the problem to the point that coherent representation of both systems on the same platform seems insuperably difficult. 
We investigated the capability for complex, integrative genomic and clinical queries to be supported in the Informatics for Integrating Biology and the Bedside (i2b2) translational software package. Three different data integration approaches were developed: The first is based on Sequence Ontology, the second is based on the tranSMART engine, and the third on CouchDB. These novel methods for representing and querying complex genomic and clinical data on the i2b2 platform are available today for advancing precision medicine.}, author = {Murphy, Shawn N and Avillach, Paul and Bellazzi, Riccardo and Phillips, Lori and Gabetta, Matteo and Eran, Alal and McDuffie, Michael T and Kohane, Isaac S} } @article {1244266, title = {The Georges Pompidou University Hospital Clinical Data Warehouse: A 8-years follow-up experience}, journal = {International Journal of Medical Informatics}, volume = {102}, year = {2017}, pages = {21-28}, abstract = { Background When developed jointly with clinical information systems, clinical data warehouses (CDWs) facilitate the reuse of healthcare data and leverage clinical research. Objective To describe both data access and use for clinical research, epidemiology and health service research of the {\textquotedblleft}H{\^o}pital Europ{\'e}en Georges Pompidou{\textquotedblright} (HEGP) CDW. Methods The CDW has been developed since 2008 using an i2b2 platform. It was made available to health professionals and researchers in October 2010. Procedures to access data have been implemented and different access levels have been distinguished according to the nature of queries. Results As of July 2016, the CDW contained the consolidated data of over 860,000 patients followed since the opening of the HEGP hospital in July 2000. These data correspond to more than 122 million clinical item values, 124 million biological item values, and 3.7 million free text reports. 
The ethics committee of the hospital evaluates all CDW projects that generate\ secondary data\ marts. Characteristics of the 74 research projects validated between January 2011 and December 2015 are described. Conclusion The use of HEGP CDWs is a key facilitator for clinical research studies. It required however important methodological and organizational support efforts from a biomedical informatics department. }, author = {Jannot, Anne-Sophie and Zapletal, Eric and Avillach, Paul and Mamzer, Marie-France and Burgan, Anita and Degoulet, Patrice} } @article {1244256, title = {CodeMapper: semiautomatic coding of case definitions. A contribution from the ADVANCE project}, journal = {Pharmacoepidemiology \& Drug Safety}, volume = {26}, number = {8}, year = {2017}, pages = {990-1005}, abstract = { Assessment of drug and vaccine effects by combining information from different healthcare databases in the European Union requires extensive efforts in the harmonization of codes as different vocabularies are being used across countries. In this paper, we present a web application called CodeMapper, which assists in the mapping of case definitions to codes from different vocabularies, while keeping a transparent record of the complete mapping process. CodeMapper builds upon coding vocabularies contained in the Metathesaurus of the Unified Medical Language System. The mapping approach consists of three phases. First, medical concepts are automatically identified in a free-text case definition. Second, the user revises the set of medical concepts by adding or removing concepts, or expanding them to related concepts that are more general or more specific. Finally, the selected concepts are projected to codes from the targeted coding vocabularies. 
We evaluated the application by comparing codes that were automatically generated from case definitions by applying CodeMapper{\textquoteright}s concept identification and successive concept expansion, with reference codes that were manually created in a previous epidemiological study. Automated concept identification alone had a sensitivity of 0.246 and positive predictive value (PPV) of 0.420 for reproducing the reference codes. Three successive steps of concept expansion increased sensitivity to 0.953 and PPV to 0.616. Automatic concept identification in the case definition alone was insufficient to reproduce the reference codes, but CodeMapper{\textquoteright}s operations for concept expansion provide an effective, efficient, and transparent way for reproducing the reference codes. }, author = {Becker, Benedikt F.H. and Avillach, Paul and Romio, Silvana and van Mulligan, Erik M. and Weibel, Daniel and Sturkenboom, Miriam C.J.M. and Kors, Jan A.} } @article {1163171, title = {Phelan-McDermid syndrome data network: Integrating patient reported outcomes with clinical notes and curated genetic reports}, journal = {American Journal of Medical Genetics Part B: Neuropsychiatric Genetics}, year = {2017}, abstract = {The heterogeneity of patient phenotype data are an impediment to the research into the origins and progression of neuropsychiatric disorders. This difficulty is compounded in the case of rare disorders such as Phelan-McDermid Syndrome (PMS) by the paucity of patient clinical data. PMS is a rare syndromic genetic cause of autism and intellectual deficiency. 
In this paper, we describe the Phelan-McDermid Syndrome Data Network (PMS_DN), a platform that facilitates research into phenotype{\textendash}genotype correlation and progression of PMS by: a) integrating knowledge of patient phenotypes extracted from Patient Reported Outcomes (PRO) data and clinical notes{\textemdash}two heterogeneous, underutilized sources of knowledge about patient phenotypes{\textemdash}with curated genetic information from the same patient cohort and b) making this integrated knowledge, along with a suite of statistical tools, available free of charge to authorized investigators on a Web portal\ https://pmsdn.hms.harvard.edu. PMS_DN is a Patient Centric Outcomes Research Initiative (PCORI) where patients and their families are involved in all aspects of the management of patient data in driving research into PMS. To foster collaborative research, PMS_DN also makes patient aggregates from this knowledge available to authorized investigators using distributed research networks such as the PCORnet PopMedNet. PMS_DN is hosted on a scalable cloud based environment and complies with all patient data privacy regulations. 
As of October 31, 2016, PMS_DN integrates high-quality knowledge extracted from the clinical notes of 112 patients and curated genetic reports of 176 patients with preprocessed PRO data from 415 patients.}, url = {http://onlinelibrary.wiley.com/doi/10.1002/ajmg.b.32579/full}, author = {Kothari, Cartik and Wack, Maxime and Hassen-Khodja, Claire and Finan, Sean and Savova, Guergana and O{\textquoteright}Boyle, Megan and Bliss, Geraldine and Cornell, Andria and Horn, Elizabeth and Davis, Rebecca and Jacobs, Jacquelyn and Kohane, Isaac and Avillach, Paul} } @article {932556, title = {A database of human exposomes and phenomes from the US National Health and Nutrition Examination Survey.}, journal = {Sci Data}, volume = {3}, year = {2016}, month = {2016 Oct 25}, pages = {160096}, abstract = {The National Health and Nutrition Examination Survey (NHANES) is a population survey implemented by the Centers for Disease Control and Prevention (CDC) to monitor the health of the United States whose data is publicly available in hundreds of files. This Data Descriptor describes a single unified and universally accessible data file, merging across 255 separate files and stitching data across 4 surveys, encompassing 41,474 individuals and 1,191 variables. The variables consist of phenotype and environmental exposure information on each individual, specifically (1) demographic information, physical exam results (e.g., height, body mass index), laboratory results (e.g., cholesterol, glucose, and environmental exposures), and (4) questionnaire items. Second, the data descriptor describes a dictionary to enable analysts find variables by category and human-readable description. The datasets are available on DataDryad and a hands-on analytics tutorial is available on GitHub. 
Through a new big data platform, BD2K Patient Centered Information Commons (http://pic-sure.org), we provide a new way to browse the dataset via a web browser (https://nhanes.hms.harvard.edu) and provide application programming interface for programmatic access.}, issn = {2052-4463}, doi = {10.1038/sdata.2016.96}, author = {Patel, Chirag J and Pho, Nam and McDuffie, Michael and Easton-Marks, Jeremy and Kothari, Cartik and Kohane, Isaac S and Avillach, Paul} } @article {882076, title = {An informatics research agenda to support precision medicine: seven key areas.}, journal = {J Am Med Inform Assoc}, volume = {23}, number = {4}, year = {2016}, month = {2016 Jul}, pages = {791-5}, abstract = {The recent announcement of the Precision Medicine Initiative by President Obama has brought precision medicine (PM) to the forefront for healthcare providers, researchers, regulators, innovators, and funders alike. As technologies continue to evolve and datasets grow in magnitude, a strong computational infrastructure will be essential to realize PM{\textquoteright}s vision of improved healthcare derived from personal data. In addition, informatics research and innovation affords a tremendous opportunity to drive the science underlying PM. The informatics community must lead the development of technologies and methodologies that will increase the discovery and application of biomedical knowledge through close collaboration between researchers, clinicians, and patients. 
This perspective highlights seven key areas that are in need of further informatics research and innovation to support the realization of PM.}, issn = {1527-974X}, doi = {10.1093/jamia/ocv213}, author = {Tenenbaum, Jessica D and Avillach, Paul and Benham-Hutchins, Marge and Breitenstein, Matthew K and Crowgey, Erin L and Hoffman, Mark A and Jiang, Xia and Madhavan, Subha and Mattison, John E and Nagarajan, Radhakrishnan and Ray, Bisakha and Shin, Dmitriy and Visweswaran, Shyam and Zhao, Zhongming and Freimuth, Robert R} } @article {882066, title = {Data Extraction and Management in Networks of Observational Health Care Databases for Scientific Research: A Comparison of EU-ADR, OMOP, Mini-Sentinel and MATRICE Strategies.}, journal = {EGEMS (Wash DC)}, volume = {4}, number = {1}, year = {2016}, month = {2016}, pages = {1189}, abstract = {INTRODUCTION: We see increased use of existing observational data in order to achieve fast and transparent production of empirical evidence in health care research. Multiple databases are often used to increase power, to assess rare exposures or outcomes, or to study diverse populations. For privacy and sociological reasons, original data on individual subjects can{\textquoteright}t be shared, requiring a distributed network approach where data processing is performed prior to data sharing. CASE DESCRIPTIONS AND VARIATION AMONG SITES: We created a conceptual framework distinguishing three steps in local data processing: (1) data reorganization into a data structure common across the network; (2) derivation of study variables not present in original data; and (3) application of study design to transform longitudinal data into aggregated data sets for statistical analysis. 
We applied this framework to four case studies to identify similarities and differences in the United States and Europe: Exploring and Understanding Adverse Drug Reactions by Integrative Mining of Clinical Records and Biomedical Knowledge (EU-ADR), Observational Medical Outcomes Partnership (OMOP), the Food and Drug Administration{\textquoteright}s (FDA{\textquoteright}s) Mini-Sentinel, and the Italian network-the Integration of Content Management Information on the Territory of Patients with Complex Diseases or with Chronic Conditions (MATRICE). FINDINGS: National networks (OMOP, Mini-Sentinel, MATRICE) all adopted shared procedures for local data reorganization. The multinational EU-ADR network needed locally defined procedures to reorganize its heterogeneous data into a common structure. Derivation of new data elements was centrally defined in all networks but the procedure was not shared in EU-ADR. Application of study design was a common and shared procedure in all the case studies. Computer procedures were embodied in different programming languages, including SAS, R, SQL, Java, and C++. CONCLUSION: Using our conceptual framework we found several areas that would benefit from research to identify optimal standards for production of empirical knowledge from existing databases.an opportunity to advance evidence-based care management. 
In addition, formalized CM outcomes assessment methodologies will enable us to compare CM effectiveness across health delivery settings.}, issn = {2327-9214}, doi = {10.13063/2327-9214.1189}, author = {Gini, Rosa and Schuemie, Martijn and Brown, Jeffrey and Ryan, Patrick and Vacchi, Edoardo and Coppola, Massimo and Cazzola, Walter and Coloma, Preciosa and Berni, Roberto and Diallo, Gayo and Oliveira, Jos{\'e} Luis and Avillach, Paul and Trifir{\`o}, Gianluca and Rijnbeek, Peter and Bellentani, Mariadonata and van der Lei, Johan and Klazinga, Niek and Sturkenboom, Miriam} } @article {882061, title = {Identifying Cases of Type 2 Diabetes in Heterogeneous Data Sources: Strategy from the EMIF Project.}, journal = {PLoS One}, volume = {11}, number = {8}, year = {2016}, month = {2016}, pages = {e0160648}, abstract = {Due to the heterogeneity of existing European sources of observational healthcare data, data source-tailored choices are needed to execute multi-data source, multi-national epidemiological studies. This makes transparent documentation paramount. In this proof-of-concept study, a novel standard data derivation procedure was tested in a set of heterogeneous data sources. Identification of subjects with type 2 diabetes (T2DM) was the test case. We included three primary care data sources (PCDs), three record linkage of administrative and/or registry data sources (RLDs), one hospital and one biobank. Overall, data from 12 million subjects from six European countries were extracted. Based on a shared event definition, sixteen standard algorithms (components) useful to identify T2DM cases were generated through a top-down/bottom-up iterative approach. Each component was based on one single data domain among diagnoses, drugs, diagnostic test utilization and laboratory results. Diagnoses-based components were subclassified considering the healthcare setting (primary, secondary, inpatient care). 
The Unified Medical Language System was used for semantic harmonization within data domains. Individual components were extracted and proportion of population identified was compared across data sources. Drug-based components performed similarly in RLDs and PCDs, unlike diagnoses-based components. Using components as building blocks, logical combinations with AND, OR, AND NOT were tested and local experts recommended their preferred data source-tailored combination. The population identified per data sources by resulting algorithms varied from 3.5\% to 15.7\%, however, age-specific results were fairly comparable. The impact of individual components was assessed: diagnoses-based components identified the majority of cases in PCDs (93-100\%), while drug-based components were the main contributors in RLDs (81-100\%). The proposed data derivation procedure allowed the generation of data source-tailored case-finding algorithms in a standardized fashion, facilitated transparent documentation of the process and benchmarking of data sources, and provided bases for interpretation of possible inter-data source inconsistency of findings in future studies.}, issn = {1932-6203}, doi = {10.1371/journal.pone.0160648}, author = {Roberto, Giuseppe and Leal, Ingrid and Sattar, Naveed and Loomis, A Katrina and Avillach, Paul and Egger, Peter and van Wijngaarden, Rients and Ansell, David and Reisberg, Sulev and Tammesoo, Mari-Liis and Alavere, Helene and Pasqua, Alessandro and Pedersen, Lars and Cunningham, James and Tramontan, Lara and Mayer, Miguel A and Herings, Ron and Coloma, Preciosa and Lapi, Francesco and Sturkenboom, Miriam and van der Lei, Johan and Schuemie, Martijn J and Rijnbeek, Peter and Gini, Rosa} } @article {633561, title = {Posterior Radioscaphoid Angle as a Predictor of Wrist Degenerative Joint Disease in Patients With Scapholunate Ligament Tears.}, journal = {AJR Am J Roentgenol}, volume = {206}, number = {1}, year = {2016}, month = {2016 Jan}, pages = {144-50}, 
abstract = {OBJECTIVE: The purpose of this study is to determine whether the posterior radioscaphoid angle, a marker of posterior displacement of the scaphoid, is associated with degenerative joint disease in patients with scapholunate ligament tears. MATERIALS AND METHODS: Images from 150 patients with wrist pain who underwent CT arthrography and radiography were retrospectively evaluated. Patients with and without scapholunate ligament ruptures were divided into two groups according to CT arthrography findings. The presence of degenerative changes (scapholunate advanced collapse [SLAC] wrist) was evaluated and graded on conventional radiographs. Images were evaluated by two readers independently, and an adjudicator analyzed the discordant cases. Posterior radioscaphoid angle values were correlated with CT arthrography and radiographic findings. The association between posterior radioscaphoid angle and degenerative joint disease was evaluated. Scapholunate and radiolunate angles were considered in the analysis. RESULTS: The posterior radioscaphoid angle was measurable in all patients, with substantial interobserver agreement (intraclass correlation coefficient, 0.75). The posterior radioscaphoid angle performed better than did the scapholunate and radiolunate angles in the differentiation of patients with and without SLAC wrist (p < 0.02). Posterior radioscaphoid angles greater than 114{\textdegree} presented an 80.0\% sensitivity and 89.7\% specificity for the detection of SLAC wrist. 
CONCLUSION: Posterior radioscaphoid angles were strongly associated with degenerative wrist disease, with potential prognostic implications in patients with wrist trauma and scapholunate ligament ruptures.}, issn = {1546-3141}, doi = {10.2214/AJR.15.14606}, author = {Gondim Teixeira, Pedro Augusto and De Verbizier, Jacques and Aptel, Sabine and Wack, Maxime and Dap, Fran{\c c}ois and Dautel, Gilles and Blum, Alain} } @article {1217301, title = {Conceptual Knowledge Discovery in Databases for Drug Combinations Predictions in Malignant Melanoma}, journal = {Stud Health Technol Inform}, volume = {216}, year = {2015}, month = {2015}, pages = {663-7}, abstract = {The worldwide incidence of melanoma is rising faster than any other cancer, and prognosis for patients with metastatic disease is poor. Current targeted therapies are limited in their durability and/or effect size in certain patient populations due to acquired mechanisms of resistance. Thus, the development of synergistic combinatorial treatment regimens holds great promise to improve patient outcomes. We have previously shown that a model for in-silico knowledge discovery, Translational Ontology-anchored Knowledge Discovery Engine (TOKEn), is able to generate valid relationships between bimolecular and clinical phenotypes. In this study, we have aggregated observational and canonical knowledge consisting of melanoma-related biomolecular entities and targeted therapeutics in a computationally tractable model. 
We demonstrate here that the explicit linkage of therapeutic modalities with biomolecular underpinnings of melanoma utilizing the TOKEn pipeline yield a set of informed relationships that have the potential to generate combination therapy strategies.}, keywords = {Antineoplastic Combined Chemotherapy Protocols, Clinical Pharmacy Information Systems, data mining, Databases, Pharmaceutical, Decision Support Systems, Clinical, Knowledge Bases, machine learning, Melanoma, Natural Language Processing, Skin Neoplasms}, issn = {0926-9630}, author = {Regan, Kelly and Raje, Satyajeet and Saravanamuthu, Cartik and Payne, Philip R O} } @article {539701, title = {Evaluating the Impact of Computerized Provider Order Entry on Medical Students Training at Bedside: A Randomized Controlled Trial.}, journal = {PLoS One}, volume = {10}, number = {9}, year = {2015}, month = {2015}, pages = {e0138094}, abstract = {OBJECTIVE: To evaluate the impact of computerized provider order entry (CPOE) at the bedside on medical students training. MATERIALS AND METHODS: We conducted a randomized cross-controlled educational trial on medical students during two clerkship rotations in three departments, assessing the impact of the use of CPOE on their ability to place adequate monitoring and therapeutic orders using a written test before and after each rotation. Students{\textquoteright} satisfaction with their practice and the order placement system was surveyed. A multivariate mixed model was used to take individual students and chief resident (CR) effects into account. Factorial analysis was applied on the satisfaction questionnaire to identify dimensions, and scores were compared on these dimensions. 
RESULTS: Thirty-six students show no better progress (beginning and final test means = 69.87 and 80.98 points out of 176 for the control group, 64.60 and 78.11 for the CPOE group, p = 0.556) during their rotation in either group, even after adjusting for each student and CR, but show a better satisfaction with patient care and greater involvement in the medical team in the CPOE group (p = 0.035*). Both groups have a favorable opinion regarding CPOE as an educational tool, especially because of the order reviewing by the supervisor. CONCLUSION: This is the first randomized controlled trial assessing the performance of CPOE in both the progress in prescriptions ability and satisfaction of the students. The absence of effect on the medical skills must be weighted by the small time scale and low sample size. However, students are more satisfied when using CPOE rather than usual training.}, issn = {1932-6203}, doi = {10.1371/journal.pone.0138094}, author = {Wack, Maxime and Puymirat, Etienne and Ranque, Brigitte and Georgin-Lavialle, Sophie and Pierre, Isabelle and Tanguy, Aurelia and Ackermann, Felix and Mallet, Celine and Pavie, Juliette and Boultache, Hakima and Durieux, Pierre and Avillach, Paul} } @article {505021, title = {Acute graft-versus-host disease, invasive aspergillosis and Clostridium difficile colitis after peripheral blood stem cell transplantation: A complex network of causalities and a challenge for prevention.}, journal = {Anaerobe}, volume = {33}, year = {2015}, month = {2015 Jun}, pages = {98-100}, abstract = {Graft-versus-host disease (GVHD) is a known risk factor for invasive aspergillosis (IA), but remains poorly studied in relation to Clostridium difficile infection (CDI). We report a case of a 58-year-old patient who developed an IA within a protected room, CDI and GVHD after allogeneic peripheral blood stem cell transplantation (PBSCT). 
Factors associated with this complex condition in patients receiving allogeneic PBSCT need to be identified.}, issn = {1095-8274}, doi = {10.1016/j.anaerobe.2015.02.007}, author = {Khanafer, Nagham and Neuraz, Antoine and B{\'e}net, Thomas and Cour, Martin and Persat, Florence and Labussi{\`e}re, H{\'e}l{\`e}ne and Argaud, Laurent and Michallet, Mauricette and Vanhems, Philippe} } @article {505011, title = {Comment on: Persistence and adherence to single-tablet regimens in HIV treatment: a cohort study from the French National Healthcare Insurance Database.}, journal = {J Antimicrob Chemother}, year = {2015}, month = {2015 Aug 7}, issn = {1460-2091}, doi = {10.1093/jac/dkv242}, author = {Grammatico-Guillon, Leslie and Gras, Guillaume and Hassen-Khodja, Claire and Maakaroun, Zoha and Bastides, Fr{\'e}d{\'e}ric and Barin, Francis and Bernard, Louis} } @article {505031, title = {An Integrated Workflow For Secondary Use of Patient Data for Clinical Research.}, journal = {Stud Health Technol Inform}, volume = {216}, year = {2015}, month = {2015}, pages = {913}, abstract = {This work proposes an integrated workflow for secondary use of medical data to serve feasibility studies, and the prescreening and monitoring of research studies. All research issues are initially addressed by the Clinical Research Office through a research portal and subsequently redirected to relevant experts in the determined field of concentration. For secondary use of data, the workflow is then based on the clinical data warehouse of the hospital. A datamart with potentially eligible research candidates is constructed. Datamarts can either produce aggregated data, de-identified data, or identified data, according to the kind of study being treated. 
In conclusion, integrating the secondary use of data process into a general research workflow allows visibility of information technologies and improves the accessibility of clinical data.}, issn = {0926-9630}, author = {Bouzill{\'e}, Guillaume and Sylvestre, Emmanuelle and Campillo-Gimenez, Boris and Renault, Eric and Ledieu, Thibault and Delamarre, Denis and Cuggia, Marc} } @article {505026, title = {A Metadata based Knowledge Discovery Methodology for Seeding Translational Research.}, journal = {Stud Health Technol Inform}, volume = {216}, year = {2015}, month = {2015}, pages = {1071}, abstract = {In this paper, we present a semantic, metadata based knowledge discovery methodology for identifying teams of researchers from diverse backgrounds who can collaborate on interdisciplinary research projects: projects in areas that have been identified as high-impact areas at The Ohio State University. This methodology involves the semantic annotation of keywords and the postulation of semantic metrics to improve the efficiency of the path exploration algorithm as well as to rank the results. Results indicate that our methodology can discover groups of experts from diverse areas who can collaborate on translational research projects.}, issn = {0926-9630}, author = {Kothari, Cartik R and Payne, Philip R O} } @article {505016, title = {Patient Mortality Is Associated With Staff Resources and Workload in the ICU: A Multicenter Observational Study.}, journal = {Crit Care Med}, volume = {43}, number = {8}, year = {2015}, month = {2015 Aug}, pages = {1587-94}, abstract = {OBJECTIVE: Matching healthcare staff resources to patient needs in the ICU is a key factor for quality of care. We aimed to assess the impact of the staffing-to-patient ratio and workload on ICU mortality. DESIGN: We performed a multicenter longitudinal study using routinely collected hospital data. 
SETTING: Information pertaining to every patient in eight ICUs from four university hospitals from January to December 2013 was analyzed. PATIENTS: A total of 5,718 inpatient stays were included. INTERVENTIONS: None. MEASUREMENTS AND MAIN RESULTS: We used a shift-by-shift varying measure of the patient-to-caregiver ratio in combination with workload to establish their relationships with ICU mortality over time, excluding patients with decision to forego life-sustaining therapy. Using a multilevel Poisson regression, we quantified ICU mortality-relative risk, adjusted for patient turnover, severity, and staffing levels. The risk of death was increased by 3.5 (95\% CI, 1.3-9.1) when the patient-to-nurse ratio was greater than 2.5, and it was increased by 2.0 (95\% CI, 1.3-3.2) when the patient-to-physician ratio exceeded 14. The highest ratios occurred more frequently during the weekend for nurse staffing and during the night for physicians (p < 0.001). High patient turnover (adjusted relative risk, 5.6 [2.0-15.0]) and the volume of life-sustaining procedures performed by staff (adjusted relative risk, 5.9 [4.3-7.9]) were also associated with increased mortality. CONCLUSIONS: This study proposes evidence-based thresholds for patient-to-caregiver ratios, above which patient safety may be endangered in the ICU. 
Real-time monitoring of staffing levels and workload is feasible for adjusting caregivers{\textquoteright} resources to patients{\textquoteright} needs.}, issn = {1530-0293}, doi = {10.1097/CCM.0000000000001015}, author = {Neuraz, Antoine and Gu{\'e}rin, Claude and Payet, C{\'e}cile and Polazzi, St{\'e}phanie and Aubrun, Fr{\'e}d{\'e}ric and Dailler, Fr{\'e}d{\'e}ric and Lehot, Jean-Jacques and Piriou, Vincent and Neidecker, Jean and Rimmel{\'e}, Thomas and Schott, Anne-Marie and Duclos, Antoine} } @article {505036, title = {Vaccination coverage of children with inflammatory bowel disease after an awareness campaign on the risk of infection.}, journal = {Dig Liver Dis}, volume = {47}, number = {6}, year = {2015}, month = {2015 Jun}, pages = {460-4}, abstract = {BACKGROUND: Children with inflammatory bowel disease are at risk of vaccine-preventable diseases mostly due to immunosuppressive drugs. AIM: To evaluate coverage after an awareness campaign informing patients, their parents and general practitioner about the vaccination schedule. METHODS: Vaccination coverage was firstly evaluated and followed by an awareness campaign on the risk of infection via postal mail. The trial is a case-control study on the same patients before and after the awareness campaign. Overall, 92 children were included. A questionnaire was then completed during a routine appointment to collect data including age at diagnosis, age at data collection, treatment history, and vaccination status. RESULTS: Vaccination rates significantly increased for vaccines against diphtheria-tetanus-poliomyelitis (92\% vs. 100\%), Haemophilus influenzae (88\% vs. 98\%), hepatitis B (52\% vs. 71\%), pneumococcus (36\% vs. 57\%), and meningococcus C (17\% vs. 
41\%) (p \ldots) [remainder of abstract truncated in source].}, issn = {1878-3562}, doi = {10.1016/j.dld.2015.02.009}, author = {Fleurier, Aude and Pelatan, Cecile and Willot, Stephanie and Ginies, Jean-Louis and Breton, Estelle and Bridoux, Laure and Segura, Jean-Francois and Chaillou, Emilie and Jobert, Agathe and Darviot, Estelle and Cagnard, Benoit and Delaperriere, Nadege and Grimal, Isabelle and Carre, Emilie and Wagner, Anne-Claire and Sylvestre, Emmanuelle and Dabadie, Alain} } @article {504826, title = {[Evaluation of the theoretical teaching of postgraduate medical students in France].}, journal = {Rev Med Interne}, year = {2015}, month = {2015 May 13}, abstract = {OBJECTIVES: In France, medical students regularly complain about the shortcomings of their theoretical training and the necessity of its adaptation to better fit the needs of students. The goal was to evaluate the theoretical teaching practices in postgraduate medical studies by: 1) collecting data from medical students in different medical faculties in France; 2) comparing this data with expected practices when it is possible; 3) and proposing several lines of improvement. METHODS: A survey of theoretical practices in the 3rd cycle of medical studies was conducted by self-administered questionnaires which were free of charge, anonymous, and administered electronically from July 3~to October 31, 2013~to all medical students in France. RESULTS: National, inter-regional, regional and field internship educational content was absent in respectively 50.5\%, 42.8\%, 26.0\% and 30.2\% of cases. Medical students follow complementary training due to insufficient DES and/or DESC 2~training in 43.7\% of cases or as part of a professional project in 54.9\% of cases. The knowledge sought by medical students concerns the following crosscutting topics: career development (58.9\%), practice management (50.7\%), medical English (50.4\%) and their specialty organization (49.9\%). 
Fifty-four point one percent would like to be evaluated on their theoretical training on an annual basis. CONCLUSION: The results of this first national survey give insights into the theoretical teaching conditions in postgraduate medical education in France and the aspirations of medical students.}, issn = {1768-3122}, doi = {10.1016/j.revmed.2015.02.009}, author = {Faivre, J-C and Agopiantz, M and Loeb, E and Cassinari, K and Wack, M and Catoire, P and Braun, M and Thilly, N and Coudane, H} } @article {504776, title = {Detection of Drug-Drug Interactions Inducing Acute Kidney Injury by Electronic Health Records Mining}, journal = {Drug Safety}, year = {2015}, abstract = { BACKGROUND AND OBJECTIVE: While risk of acute kidney injury (AKI) is a well documented adverse effect of some drugs, few studies have assessed the relationship between drug-drug interactions (DDIs) and AKI. Our objective was to develop an algorithm capable of detecting potential signals on this relationship by retrospectively mining data from electronic health records. MATERIAL AND METHODS: Data were extracted from the clinical data warehouse (CDW) of the H{\^o}pital Europ{\'e}en Georges Pompidou (HEGP). AKI was defined as the first level of the RIFLE criteria, that is, an increase >=50\ \% of creatinine basis. Algorithm accuracy was tested on 20 single drugs, 10 nephrotoxic and 10 non-nephrotoxic. We then tested 45 pairs of non-nephrotoxic drugs, among the most prescribed at our hospital and representing distinct pharmacological classes for DDIs. RESULTS: Sensitivity and specificity were 50\ \% [95\ \% confidence interval (CI) 23.66-76.34] and 90\ \% (95\ \% CI 59.58-98.21), respectively, for single drugs. Our algorithm confirmed a previously identified signal concerning clarithromycin and calcium-channel blockers (unadjusted odds ratio (ORu) 2.92; 95\ \% CI 1.11-7.69, p\ =\ 0.04). 
Among the 45 drug pairs investigated, we identified a signal concerning 55 patients in association with bromazepam and hydroxyzine (ORu 1.66; 95\ \% CI 1.23-2.23). This signal was not confirmed after a chart review. Even so, AKI and co-prescription were confirmed for 96\ \% (95\ \% CI 88-99) and 88\ \% (95\ \% CI 76-94) of these patients, respectively. CONCLUSION: Data mining techniques on CDW can foster the detection of adverse drug reactions when drugs are used alone or in combination. }, author = {Girardeau, Y and Trivin, C and Durieux, P and Le Beller, C and Louet Agnes, LL and Neuraz, A and Degoulet, P and Avillach, P} } @article {504771, title = {Prevalence of Inflammatory Bowel Disease Among Patients with Autism Spectrum Disorders}, journal = {Inflammatory Bowel Diseases}, year = {2015}, abstract = {Background: The objective of this study was to measure the prevalence of inflammatory bowel disease (IBD) among patients with autism spectrum disorders (ASD), which has not been well described previously. Methods: The rates of IBD among patients with and without ASD were measured in 4 study populations with distinct modes of ascertainment: a health care benefits company, 2 pediatric tertiary care centers, and a national ASD repository. The rates of IBD (established through International Classification of Diseases, Ninth Revision, Clinical Modification [ICD-9-CM] codes) were compared with respective controls and combined using a Stouffer meta-analysis. Clinical charts were also reviewed for IBD among patients with ICD-9-CM codes for both IBD and ASD at one of the pediatric tertiary care centers. This expert-verified rate was compared with the rate in the repository study population (where IBD diagnoses were established by expert review) and in nationally reported rates for pediatric IBD. 
Results: In all of case-control study populations, the rates of IBD-related ICD-9-CM codes for patients with ASD were significantly higher than that of their respective controls (Stouffer meta-analysis, P $<$ 0.001). Expert-verified rates of IBD among patients with ASD were 7 of 2728 patients in one study population and 16 of 7201 in a second study population. The age-adjusted prevalence of IBD among patients with ASD was higher than their respective controls and nationally reported rates of pediatric IBD. Conclusions: Across each population with different kinds of ascertainment, there was a consistent and statistically significant increased prevalence of IBD in patients with ASD than their respective controls and nationally reported rates for pediatric IBD.}, url = {http://www.ncbi.nlm.nih.gov/pubmed/26218138}, author = {Doshi-Velez, F and Avillach, P and Palmer, N and Bousvaros, A and Ge, Y and Fox, K and Steinberg, G and Spettell, C and Juster, I and Kohane, I} } @article {292826, title = {Translational research platforms integrating clinical and omics data: a review of publicly available solutions.}, journal = {Brief Bioinform}, volume = {16}, number = {2}, year = {2015}, month = {2015 Mar}, pages = {280-90}, abstract = {The rise of personalized medicine and the availability of high-throughput molecular analyses in the context of clinical care have increased the need for adequate tools for translational researchers to manage and explore these data. We reviewed the biomedical literature for translational platforms allowing the management and exploration of clinical and omics data, and identified seven platforms: BRISK, caTRIP, cBio Cancer Portal, G-DOC, iCOD, iDASH and tranSMART. We analyzed these platforms along seven major axes. (1) The community axis regrouped information regarding initiators and funders of the project, as well as availability status and references. 
(2) We regrouped under the information content axis the nature of the clinical and omics data handled by each system. (3) The privacy management environment axis encompassed functionalities allowing control over data privacy. (4) In the analysis support axis, we detailed the analytical and statistical tools provided by the platforms. We also explored (5) interoperability support and (6) system requirements. The final axis (7) platform support listed the availability of documentation and installation procedures. A large heterogeneity was observed in regard to the capability to manage phenotype information in addition to omics data, their security and interoperability features. The analytical and visualization features strongly depend on the considered platform. Similarly, the availability of the systems is variable. This review aims at providing the reader with the background to choose the platform best suited to their needs. To conclude, we discuss the desiderata for optimal translational research platforms, in terms of privacy, interoperability and technical features.}, issn = {1477-4054}, doi = {10.1093/bib/bbu006}, author = {Canuel, Vincent and Rance, Bastien and Avillach, Paul and Degoulet, Patrice and Burgun, Anita} } @article {356346, title = {Limiting a Medline/PubMed query to the "best" articles using the JCR relative impact factor.}, journal = {Rev Epidemiol Sante Publique}, volume = {62}, number = {6}, year = {2014}, month = {2014 Dec}, pages = {361-5}, abstract = {BACKGROUND: Medline/PubMed is the most frequently used medical bibliographic research database. The aim of this study was to propose a new generic method to limit any Medline/PubMed query based on the relative impact factor and the A \& B categories of the SIGAPS score. MATERIAL AND METHODS: The entire PubMed corpus was used for the feasibility study, then ten frequent diseases in terms of PubMed indexing and the citations of four Nobel prize winners. 
The relative impact factor (RIF) was calculated by medical specialty defined in Journal Citation Reports. The two queries, which included all the journals in category A (or A OR B), were added to any Medline/PubMed query as a central point of the feasibility study. RESULTS: Limitation using the SIGAPS category A was larger than the when using the Core Clinical Journals (CCJ): 15.65\% of PubMed corpus vs 8.64\% for CCJ. The response time of this limit applied to the entire PubMed corpus was less than two seconds. For five diseases out of ten, limiting the citations with the RIF was more effective than with the CCJ. For the four Nobel prize winners, limiting the citations with the RIF was more effective than the CCJ. CONCLUSION: The feasibility study to apply a new filter based on the relative impact factor on any Medline/PubMed query was positive.}, issn = {0398-7620}, doi = {10.1016/j.respe.2014.09.008}, author = {Avillach, P and Kerdelhu{\'e}, G and Devos, P and Maisonneuve, H and Darmoni, S J} } @article {356026, title = {Hospital and ambulatory management, and compliance to treatment in HIV infection: regional health insurance agency analysis.}, journal = {Med Mal Infect}, volume = {44}, number = {9}, year = {2014}, month = {2014 Sep}, pages = {423-8}, abstract = {OBJECTIVE: We had for objective to study HIV management (hospital, ambulatory, and mixed) and assess compliance with health insurance database. METHOD: We conducted a retrospective study using the French Social Security (CPAM) database. The inclusion criteria were: age>18years of age, at least 2 prescriptions of antiretroviral therapy. RESULTS: Five hundred and seventy-five patients were included: extra-hospital (12), hospital (162), mixed (401). The prescriptions were exclusively hospital issued for 76.2\% of the patients. 
Among the mixed group patients, 91\% of treatments were delivered at least once in the community, and 45.6\% of biological tests were performed in private laboratories at least once. The sex ratio (2.1 vs. 1.3), the number of patients having switched antiretroviral therapy (36.7\% vs. 27.8\%), and the frequency of biological tests (3.1 vs. 2.6) were significantly higher in the mixed group compared to the hospital group. The mean compliance was 90\% in the hospital group and 91.8\% in the mixed group. The compliance was \ldots [remainder of abstract truncated in source].}, issn = {1769-6690}, doi = {10.1016/j.medmal.2014.08.004}, author = {Hassen-Khodja, C and Gras, G and Grammatico-Guillon, L and Dupuy, C and Gomez, J-F and Freslon, L and Dailloux, J-F and Soufflet, A and Bernard, L} } @article {292866, title = {Pilot evaluation of an automated method to decrease false-positive signals induced by co-prescriptions in spontaneous reporting databases.}, journal = {Pharmacoepidemiol Drug Saf}, volume = {23}, number = {2}, year = {2014}, month = {2014 Feb}, pages = {186-94}, abstract = {PURPOSE: To test an automated method to decrease the number of false-positive (FP) signals of disproportionate reportings (SDRs) generated by co-prescription. METHODS: Automated backward stepwise removal of reports concerning the drug associated with the highest ranked SDR for an event was tested for gastric and oesophageal haemorrhages (GOH), central nervous system haemorrhages and cerebrovascular accidents (CNSH), ischaemic coronary artery disorders and muscle pains (MP) using the reporting odds ratio in the French spontaneous reporting research database. After ranking SDRs detected in the complete dataset on the lower limit of the reporting odds ratio 95\% confidence interval, reports concerning the drug with the highest ranked SDR were removed. In the dataset thus generated, SDRs were again identified, ranked and reports related to the drug involved in the newly highest ranked SDR removed. 
The process was repeated until no signal was detected. Initially detected SDRs eliminated using this technique were assessed regarding the summary of products characteristics and the literature to determine their FP nature. RESULTS: Seventeen SDRs were successively eliminated for GOH, 37 for CNSH, 15 for ischaemic coronary artery disorders, and 36 for MP. Four were FP for GOH, 29 for CNSH, 7 for ACI and none were FP for MP. The positive predictive value of the backward stepwise removal procedure in identifying FP SDRs ranged from 0\% (MP) to 78.4\% (CNSH). CONCLUSIONS: Although further adjustment is needed to improve the method presented herein, our results suggest that numerous FP signals because of co-prescription bias could be eliminated using an automated method.}, keywords = {Adverse Drug Reaction Reporting Systems, Automation, Bias (Epidemiology), Databases, Factual, Drug-Related Side Effects and Adverse Reactions, False Positive Reactions, France, Humans, Limit of Detection, Pharmacovigilance, Pilot Projects, Prescription Drugs}, issn = {1099-1557}, doi = {10.1002/pds.3454}, author = {Avillach, Paul and Salvo, Francesco and Thiessard, Frantz and Miremont-Salam{\'e}, Ghada and Fourrier-R{\'e}glat, Annie and Haramburu, Fran{\c c}oise and B{\'e}gaud, Bernard and Moore, Nicholas and Pariente, Antoine} } @article {292821, title = {Etiologies and diagnostic work-up of extreme macrocytosis defined by an erythrocyte mean corpuscular volume over 130~fL: A study of 109 patients.}, journal = {Am J Hematol}, volume = {89}, number = {6}, year = {2014}, month = {2014 Jun}, pages = {665-6}, keywords = {Anemia, Macrocytic, Blood Cell Count, Erythrocyte Indices, Female, Humans, Male, Middle Aged}, issn = {1096-8652}, doi = {10.1002/ajh.23718}, author = {Planche, Virginie and Georgin-Lavialle, Sophie and Avillach, Paul and Ranque, Brigitte and Pavie, Juliette and Caruba, Thibaut and Darnige, Luc and Pouchot, Jacques} } @article {292811, title = {Guide to good 
practices to ensure privacy protection in secondary use of medical records.}, journal = {Rev Epidemiol Sante Publique}, volume = {62}, number = {3}, year = {2014}, month = {2014 Jun}, pages = {207-14}, issn = {0398-7620}, doi = {10.1016/j.respe.2014.03.005}, author = {Riou, C and Fresson, J and Serre, J L and Avillach, P and Leneveut, L and Quantin, C} } @article {292816, title = {Patient-powered research networks: building capacity for conducting patient-centered clinical outcomes research.}, journal = {J Am Med Inform Assoc}, volume = {21}, number = {4}, year = {2014}, month = {2014 Jul-Aug}, pages = {583-6}, abstract = {The Patient-Centered Outcomes Research Institute (PCORI) recently launched PCORnet to establish a single inter-operable multicenter data research network that will support observational research and randomized clinical trials. This paper provides an overview of the patient-powered research networks (PPRNs), networks of patient organizations focused on a particular health condition that are interested in sharing health information and engaging in research. PPRNs will build on their foundation of trust within the patient communities and draw on their expertise, working with participants to identify true patient-centered outcomes and direct a patient-centered research agenda. The PPRNs will overcome common challenges including enrolling a diverse and representative patient population; engaging patients in governance; designing the data infrastructure; sharing data securely while protecting privacy; prioritizing research questions; scaling small networks into a larger network; and identifying pathways to sustainability. 
PCORnet will be the first distributed research network to bring PCOR to national scale.}, keywords = {Computer Communication Networks, Computer Security, Electronic Health Records, Humans, Outcome Assessment (Health Care), Patient Participation, Patient-Centered Care, United States}, issn = {1527-974X}, doi = {10.1136/amiajnl-2014-002758}, author = {Daugherty, Sarah E and Wahba, Sarita and Fleurence, Rachael} } @article {292831, title = {Signal detection of potentially drug-induced acute liver injury in children using a multi-country healthcare database network.}, journal = {Drug Saf}, volume = {37}, number = {2}, year = {2014}, month = {2014 Feb}, pages = {99-108}, abstract = {BACKGROUND: Data mining in spontaneous reporting databases has shown that drug-induced liver injury is infrequently reported in children. OBJECTIVES: Our objectives were to (i) identify drugs potentially associated with acute liver injury (ALI) in children and adolescents using electronic healthcare record (EHR) data; and (ii) to evaluate the significance and novelty of these associations. METHODS: We identified potential cases of ALI during exposure to any prescribed/dispensed drug for individuals 1 and in the presence of at least three exposed cases of ALI. Potentially new signals were distinguished from already known associations concerning ALI (whether in adults and/or in the paediatric population) through manual review of published literature and drug product labels. 
RESULTS: The study population comprised 4,838,146 individuals aged \ldots [remainder of abstract truncated in source].}, keywords = {Adolescent, Adverse Drug Reaction Reporting Systems, Child, Child Welfare, Child, Preschool, data mining, Databases, Factual, Drug-Induced Liver Injury, Drug-Related Side Effects and Adverse Reactions, Electronic Health Records, European Union, Humans, Infant, Infant, Newborn, International Cooperation, Liver Failure, Acute}, issn = {0114-5916}, doi = {10.1007/s40264-013-0132-9}, author = {Ferrajolo, Carmen and Coloma, Preciosa M and Verhamme, Katia M C and Schuemie, Martijn J and de Bie, Sandra and Gini, Rosa and Herings, Ron and Mazzaglia, Giampiero and Picelli, Gino and Giaquinto, Carlo and Scotti, Lorenza and Avillach, Paul and Pedersen, Lars and Rossi, Francesco and Capuano, Annalisa and van der Lei, Johan and Trifir{\`o}, Gianluca and Sturkenboom, Miriam C J M} } @article {292836, title = {Urinary retinol binding protein is a marker of the extent of interstitial kidney fibrosis.}, journal = {PLoS One}, volume = {9}, number = {1}, year = {2014}, month = {2014}, pages = {e84708}, abstract = {Currently, a non-invasive method to estimate the degree of interstitial fibrosis (IF) in chronic kidney disease is not available in routine. The aim of our study was to evaluate the diagnostic performance of the measurement of urinary low molecular weight (LMW) protein concentrations as a method to determine the extent of IF. The urines specimen from 162 consecutive patients who underwent renal biopsy were used in the analysis. Numerical quantification software based on the colorimetric analysis of fibrous areas was used to assess the percentage IF. Total proteinuria, albuminuria, and the urinary levels of retinol binding protein (RBP), alpha1-microglobulin ($\alpha$1MG), beta 2-microglobulin ($\beta$2MG), transferrin, and IgG immunoglobulins were measured. 
There was a significant correlation between the degree of IF and the RBP/creatinine (creat) ratio (R2: 0.11, p25\% of the parenchyma was 95\% when using a threshold of 20 mg/g creat. In conclusion, RBP appears to be a quantitative and non-invasive marker for the independent prediction of the extent of kidney IF. Because methods for the measurement of urinary RBP are available in most clinical chemistry departments, RBP measurement is appealing for implementation in the routine care of patients with chronic kidney disease.}, keywords = {Biological Markers, Female, Fibrosis, Glomerular Filtration Rate, Humans, Kidney, Male, Middle Aged, Molecular Weight, Renal Insufficiency, Chronic, Retinol-Binding Proteins}, issn = {1932-6203}, doi = {10.1371/journal.pone.0084708}, author = {Pallet, Nicolas and Chauvet, Sophie and Chass{\'e}, Jean-Fran{\c c}ois and Vincent, Marc and Avillach, Paul and Levi, Charlene and Meas-Yedid, Vannary and Olivo-Marin, Jean-Christophe and Nga-Matsogo, Diane and Beaune, Philippe and Thervet, Eric and Karras, Alexandre} } @article {292881, title = {Design and validation of an automated method to detect known adverse drug reactions in MEDLINE: a contribution from the EU-ADR project.}, journal = {J Am Med Inform Assoc}, volume = {20}, number = {3}, year = {2013}, month = {2013 May 1}, pages = {446-52}, abstract = {OBJECTIVES: The aim of this research was to automate the search of publications concerning adverse drug reactions (ADR) by defining the queries used to search MEDLINE and by determining the required threshold for the number of extracted publications to confirm the drug/event association in the literature. 
METHODS: We defined an approach based on the medical subject headings (MeSH) {\textquoteleft}descriptor records{\textquoteright} and {\textquoteleft}supplementary concept records{\textquoteright} thesaurus, using the subheadings {\textquoteleft}chemically induced{\textquoteright} and {\textquoteleft}adverse effects{\textquoteright} with the {\textquoteleft}pharmacological action{\textquoteright} knowledge. An expert-built validation set of true positive and true negative drug/adverse event associations (n=61) was used to validate our method. RESULTS: Using a threshold of three or more extracted publications, the automated search method presented a sensitivity of 90\% and a specificity of 100\%. For nine different drug/event pairs selected, the recall of the automated search ranged from 24\% to 64\% and the precision from 93\% to 48\%. CONCLUSIONS: This work presents a method to find previously established relationships between drugs and adverse events in the literature. Using MEDLINE, following a MeSH approach to filter the signals, is a valid option. 
Our contribution is available as a web service that will be integrated in the final European EU-ADR project (Exploring and Understanding Adverse Drug Reactions by integrative mining of clinical records and biomedical knowledge) automated system.}, keywords = {Drug-Related Side Effects and Adverse Reactions, Europe, Humans, Information Storage and Retrieval, Internet, Medical Subject Headings, MEDLINE}, issn = {1527-974X}, doi = {10.1136/amiajnl-2012-001083}, author = {Avillach, Paul and Dufour, Jean-Charles and Diallo, Gayo and Salvo, Francesco and Joubert, Michel and Thiessard, Frantz and Mougin, Fleur and Trifir{\`o}, Gianluca and Fourrier-R{\'e}glat, Annie and Pariente, Antoine and Fieschi, Marius} } @article {292876, title = {The EU-ADR Web Platform: delivering advanced pharmacovigilance tools.}, journal = {Pharmacoepidemiol Drug Saf}, volume = {22}, number = {5}, year = {2013}, month = {2013 May}, pages = {459-67}, abstract = {PURPOSE: Pharmacovigilance methods have advanced greatly during the last decades, making post-market drug assessment an essential drug evaluation component. These methods mainly rely on the use of spontaneous reporting systems and health information databases to collect expertise from huge amounts of real-world reports. The EU-ADR Web Platform was built to further facilitate accessing, monitoring and exploring these data, enabling an in-depth analysis of adverse drug reactions risks. METHODS: The EU-ADR Web Platform exploits the wealth of data collected within a large-scale European initiative, the EU-ADR project. Millions of electronic health records, provided by national health agencies, are mined for specific drug events, which are correlated with literature, protein and pathway data, resulting in a rich drug-event dataset. Next, advanced distributed computing methods are tailored to coordinate the execution of data-mining and statistical analysis tasks. 
This permits obtaining a ranked drug-event list, removing spurious entries and highlighting relationships with high risk potential. RESULTS: The EU-ADR Web Platform is an open workspace for the integrated analysis of pharmacovigilance datasets. Using this software, researchers can access a variety of tools provided by distinct partners in a single centralized environment. Besides performing standalone drug-event assessments, they can also control the pipeline for an improved batch analysis of custom datasets. Drug-event pairs can be substantiated and statistically analysed within the platform{\textquoteright}s innovative working environment. CONCLUSIONS: A pioneering workspace that helps in explaining the biological path of adverse drug reactions was developed within the EU-ADR project consortium. This tool, targeted at the pharmacovigilance community, is available online at https://bioinformatics.ua.pt/euadr/.}, keywords = {Adverse Drug Reaction Reporting Systems, data mining, Databases, Factual, Drug-Related Side Effects and Adverse Reactions, Europe, Humans, Internet, Pharmacovigilance, Software}, issn = {1099-1557}, doi = {10.1002/pds.3375}, author = {Oliveira, Jos{\'e} Luis and Lopes, Pedro and Nunes, Tiago and Campos, David and Boyer, Scott and Ahlberg, Ernst and van Mulligen, Erik M and Kors, Jan A and Singh, Bharat and Furlong, Laura I and Sanz, Ferran and Bauer-Mehren, Anna and Carrascosa, Maria C and Mestres, Jordi and Avillach, Paul and Diallo, Gayo and D{\'\i}az Acedo, Carlos and van der Lei, Johan} } @article {292891, title = {Harmonization process for the identification of medical events in eight European healthcare databases: the experience from the EU-ADR project.}, journal = {J Am Med Inform Assoc}, volume = {20}, number = {1}, year = {2013}, month = {2013 Jan 1}, pages = {184-92}, abstract = {OBJECTIVE: Data from electronic healthcare records (EHR) can be used to monitor drug safety, but in order to compare and pool data from different EHR 
databases, the extraction of potential adverse events must be harmonized. In this paper, we describe the procedure used for harmonizing the extraction from eight European EHR databases of five events of interest deemed to be important in pharmacovigilance: acute myocardial infarction (AMI); acute renal failure (ARF); anaphylactic shock (AS); bullous eruption (BE); and rhabdomyolysis (RHABD). DESIGN: The participating databases comprise general practitioners{\textquoteright} medical records and claims for hospitalization and other healthcare services. Clinical information is collected using four different disease terminologies and free text in two different languages. The Unified Medical Language System was used to identify concepts and corresponding codes in each terminology. A common database model was used to share and pool data and verify the semantic basis of the event extraction queries. Feedback from the database holders was obtained at various stages to refine the extraction queries. MEASUREMENTS: Standardized and age specific incidence rates (IRs) were calculated to facilitate benchmarking and harmonization of event data extraction across the databases. This was an iterative process. RESULTS: The study population comprised overall 19 647 445 individuals with a follow-up of 59 929 690 person-years (PYs). Age adjusted IRs for the five events of interest across the databases were as follows: (1) AMI: 60-148/100 000 PYs; (2) ARF: 3-49/100 000 PYs; (3) AS: 2-12/100 000 PYs; (4) BE: 2-17/100 000 PYs; and (5) RHABD: 0.1-8/100 000 PYs. CONCLUSIONS: The iterative harmonization process enabled a more homogeneous identification of events across differently structured databases using different coding based algorithms. 
This workflow can facilitate transparent and reproducible event extractions and understanding of differences between databases.}, keywords = {Adverse Drug Reaction Reporting Systems, Benchmarking, Databases, Factual, Europe, Humans, Incidence, Information Dissemination, Information Storage and Retrieval, International Cooperation, Medical Record Linkage, Medical Records Systems, Computerized, Product Surveillance, Postmarketing, Reference Standards, Unified Medical Language System}, issn = {1527-974X}, doi = {10.1136/amiajnl-2012-000933}, author = {Avillach, Paul and Coloma, Preciosa M and Gini, Rosa and Schuemie, Martijn and Mougin, Fleur and Dufour, Jean-Charles and Mazzaglia, Giampiero and Giaquinto, Carlo and Fornari, Carla and Herings, Ron and Molokhia, Mariam and Pedersen, Lars and Fourrier-R{\'e}glat, Annie and Fieschi, Marius and Sturkenboom, Miriam and van der Lei, Johan and Pariente, Antoine and Trifir{\`o}, Gianluca} }

% Marijon et al. (2013), Eur Heart J 34(47): regional disparities in outcomes after sudden cardiac arrest during sports.
@article{292861,
  title = {Major regional disparities in outcomes after sudden cardiac arrest during sports.},
  journal = {Eur Heart J},
  volume = {34},
  number = {47},
  year = {2013},
  month = dec,
  pages = {3632--3640},
  abstract = {AIMS: Characteristics of sudden cardiac arrest (SCA) during sports offers a novel (and unexplored) setting to assess factors associated with disparities in outcomes across regions. METHODS AND RESULTS: From a prospective 5-year community-based French registry concerning SCA during sports in 10-75 year-olds, we evaluated whether outcomes differed significantly between geographic regions. We then determined the extent to which variations in community-related early interventions were associated with regional variations in survival. Among 820 SCA cases studied, overall survival at hospital discharge was 15.7\% (95\% confidence interval, 13.2-18.2\%), with considerable regional disparities (from 3.4 to 42.6\%, P < 0.001).
Major differences were noted regarding bystander initiation of cardiopulmonary resuscitation (15.3-80.9\%, P < 0.001) and presence of initial shockable rhythm (28.6-79.1\%, P < 0.001), with higher values of these being associated with better survival rates. The proportion of survivors with favourable neurological outcome at discharge was fairly uniform among survival groups (CPC-1/2, varying from 77.4 to 90.0\%, P = 0.83). No difference was observed regarding subjects{\textquoteright} characteristics and circumstances of SCA occurrence, including delays in resuscitation (collapse-to-call period). With a comparable in-hospital mortality (P = 0.44), survival at hospital discharge was highly correlated with that at hospital admission (regional variations from 7.4 to 75.0\%, P < 0.001). CONCLUSION: Major regional disparities exist in survival rates (up to 10-fold) after SCA during sports. SCA cases from regions with the highest levels of bystander resuscitation had the best survival rates to hospital admission and discharge.},
  keywords = {Adolescent, Adult, Aged, Cardiopulmonary Resuscitation, Child, Defibrillators, Female, France, Healthcare Disparities, Hospitalization, Humans, Life Support Care, Male, Middle Aged, Out-of-Hospital Cardiac Arrest, Prospective Studies, Registries, Residence Characteristics, Sports, Survival Rate, Young Adult},
  issn = {1522-9645},
  doi = {10.1093/eurheartj/eht282},
  author = {Marijon, Eloi and Bougouin, Wulfran and Celermajer, David S and Perier, Marie-C{\'e}cile and Benameur, Nordine and Lamhaut, Lionel and Karam, Nicole and Dumas, Florence and Tafflet, Muriel and Prugger, Christof and Mustafic, Hazrije and Rifler, Jean-Pierre and Desnos, Michel and Le Heuzey, Jean-Yves and Spaulding, Christian M and Avillach, Paul and Cariou, Alain and Empana, Jean-Philippe and Jouven, Xavier}
}

% Coloma et al. (2013), Drug Saf 36(1): reference standard for evaluating drug safety signal detection methods.
@article{292871,
  title = {A reference standard for evaluation of methods for drug safety signal detection using electronic healthcare record databases.},
journal = {Drug Saf},
  volume = {36},
  number = {1},
  year = {2013},
  month = jan,
  pages = {13--23},
  abstract = {BACKGROUND: The growing interest in using electronic healthcare record (EHR) databases for drug safety surveillance has spurred development of new methodologies for signal detection. Although several drugs have been withdrawn postmarketing by regulatory authorities after scientific evaluation of harms and benefits, there is no definitive list of confirmed signals (i.e. list of all known adverse reactions and which drugs can cause them). As there is no true gold standard, prospective evaluation of signal detection methods remains a challenge. OBJECTIVE: Within the context of methods development and evaluation in the EU-ADR Project (Exploring and Understanding Adverse Drug Reactions by integrative mining of clinical records and biomedical knowledge), we propose a surrogate reference standard of drug-adverse event associations based on existing scientific literature and expert opinion. METHODS: The reference standard was constructed for ten top-ranked events judged as important in pharmacovigilance. A stepwise approach was employed to identify which, among a list of drug-event associations, are well recognized (known positive associations) or highly unlikely ({\textquoteleft}negative controls{\textquoteright}) based on MEDLINE-indexed publications, drug product labels, spontaneous reports made to the WHO{\textquoteright}s pharmacovigilance database, and expert opinion. Only drugs with adequate exposure in the EU-ADR database network (comprising $\approx$60 million person-years of healthcare data) to allow detection of an association were considered. Manual verification of positive associations and negative controls was independently performed by two experts proficient in clinical medicine, pharmacoepidemiology and pharmacovigilance. A third expert adjudicated equivocal cases and arbitrated any disagreement between evaluators.
RESULTS: Overall, 94 drug-event associations comprised the reference standard, which included 44 positive associations and 50 negative controls for the ten events of interest: bullous eruptions; acute renal failure; anaphylactic shock; acute myocardial infarction; rhabdomyolysis; aplastic anaemia/pancytopenia; neutropenia/agranulocytosis; cardiac valve fibrosis; acute liver injury; and upper gastrointestinal bleeding. For cardiac valve fibrosis, there was no drug with adequate exposure in the database network that satisfied the criteria for a positive association. CONCLUSION: A strategy for the construction of a reference standard to evaluate signal detection methods that use EHR has been proposed. The resulting reference standard is by no means definitive, however, and should be seen as dynamic. As knowledge on drug safety evolves over time and new issues in drug safety arise, this reference standard can be re-evaluated.},
  keywords = {Adverse Drug Reaction Reporting Systems, Databases, Factual, Drug-Related Side Effects and Adverse Reactions, Electronic Health Records, Humans, Pharmacovigilance, Product Surveillance, Postmarketing, Reference Standards},
  issn = {0114-5916},
  doi = {10.1007/s40264-012-0002-x},
  author = {Coloma, Preciosa M and Avillach, Paul and Salvo, Francesco and Schuemie, Martijn J and Ferrajolo, Carmen and Pariente, Antoine and Fourrier-R{\'e}glat, Annie and Molokhia, Mariam and Patadia, Vaishali and van der Lei, Johan and Sturkenboom, Miriam and Trifir{\`o}, Gianluca}
}

% Marijon et al. (2013), Circ Arrhythm Electrophysiol 6(6): sudden cardiac arrest during sports in women.
@article{292851,
  title = {Characteristics and outcomes of sudden cardiac arrest during sports in women.},
  journal = {Circ Arrhythm Electrophysiol},
  volume = {6},
  number = {6},
  year = {2013},
  month = dec,
  pages = {1185--1191},
  abstract = {BACKGROUND: No specific data are available on characteristics and outcome of sudden cardiac death (SCD) during sport activities among women in the general population.
METHODS AND RESULTS: From a prospective 5-year national survey, involving 820 subjects 10 to 75 years old who presented with SCD (resuscitated or not) during competitive or recreational sport activities, 43 (5.2\%) such events occurred in women, principally during jogging, cycling, and swimming. The level of activity at the time of SCD was moderate to vigorous in 35 cases (81.4\%). The overall incidence of sport-related SCD, among 15- to 75-year-old women, was estimated as 0.59 (95\% confidence interval [CI], 0.39-0.79) to 2.17 (95\% CI, 1.38-2.96) per year per million female sports participants for the 80th and 20th percentiles of reporting districts, respectively. Compared with men, the incidence of SCDs in women was dramatically lower, particularly in the 45- to 54-year range (relative risk, 0.033; 95\% CI, 0.015-0.075). Despite similar circumstances of occurrence, survival at hospital admission (46.5\%; 95\% CI, 31.0-60.0) was significantly higher than that for men (30.0\%; 95\% CI, 26.8-33.2; P=0.02), although this did not reach statistical significance for hospital discharge. Favorable neurological outcomes were similar (80\%). Cause of death seemed less likely to be associated with structural heart disease in women compared with men (58.3\% versus 95.8\%; P=0.003). CONCLUSIONS: Sports-related SCDs in women participants seems dramatically less common (up to 30-fold less frequent) compared with men.
Our results also suggest a higher likelihood of successful resuscitation as well as less frequency of structural heart disease in women compared with men.},
  keywords = {Adolescent, Adult, Aged, Bicycling, Child, Death, Sudden, Cardiac, Female, Humans, Middle Aged, Prognosis, Registries, Running, Sports, Swimming, Young Adult},
  issn = {1941-3084},
  doi = {10.1161/CIRCEP.113.000651},
  author = {Marijon, Eloi and Bougouin, Wulfran and Celermajer, David S and P{\'e}rier, Marie-C{\'e}cile and Dumas, Florence and Benameur, Nordine and Karam, Nicole and Lamhaut, Lionel and Tafflet, Muriel and Mustafic, Hazrije and de Deus, Natalia Machado and Le Heuzey, Jean-Yves and Desnos, Michel and Avillach, Paul and Spaulding, Christian and Cariou, Alain and Prugger, Christof and Empana, Jean-Philippe and Jouven, Xavier}
}

% Coloma et al. (2013), PLoS One 8(8): candidate drugs for drug-induced acute myocardial infarction from EHR-based surveillance.
@article{292856,
  title = {Drug-induced acute myocardial infarction: identifying {\textquoteleft}prime suspects{\textquoteright} from electronic healthcare records-based surveillance system.},
  journal = {PLoS One},
  volume = {8},
  number = {8},
  year = {2013},
  pages = {e72148},
  abstract = {BACKGROUND: Drug-related adverse events remain an important cause of morbidity and mortality and impose huge burden on healthcare costs. Routinely collected electronic healthcare data give a good snapshot of how drugs are being used in {\textquoteleft}real-world{\textquoteright} settings. OBJECTIVE: To describe a strategy that identifies potentially drug-induced acute myocardial infarction (AMI) from a large international healthcare data network. METHODS: Post-marketing safety surveillance was conducted in seven population-based healthcare databases in three countries (Denmark, Italy, and the Netherlands) using anonymised demographic, clinical, and prescription/dispensing data representing 21,171,291 individuals with 154,474,063 person-years of follow-up in the period 1996-2010.
Primary care physicians{\textquoteright} medical records and administrative claims containing reimbursements for filled prescriptions, laboratory tests, and hospitalisations were evaluated using a three-tier triage system of detection, filtering, and substantiation that generated a list of drugs potentially associated with AMI. Outcome of interest was statistically significant increased risk of AMI during drug exposure that has not been previously described in current literature and is biologically plausible. RESULTS: Overall, 163 drugs were identified to be associated with increased risk of AMI during preliminary screening. Of these, 124 drugs were eliminated after adjustment for possible bias and confounding. With subsequent application of criteria for novelty and biological plausibility, association with AMI remained for nine drugs ({\textquoteleft}prime suspects{\textquoteright}): azithromycin; erythromycin; roxithromycin; metoclopramide; cisapride; domperidone; betamethasone; fluconazole; and megestrol acetate. LIMITATIONS: Although global health status, co-morbidities, and time-invariant factors were adjusted for, residual confounding cannot be ruled out. CONCLUSION: A strategy to identify potentially drug-induced AMI from electronic healthcare data has been proposed that takes into account not only statistical association, but also public health relevance, novelty, and biological plausibility.
Although this strategy needs to be further evaluated using other healthcare data sources, the list of {\textquoteleft}prime suspects{\textquoteright} makes a good starting point for further clinical, laboratory, and epidemiologic investigation.},
  keywords = {Acute Disease, Adverse Drug Reaction Reporting Systems, Azithromycin, Betamethasone, Cisapride, Domperidone, Electronic Health Records, Fluconazole, Humans, Megestrol Acetate, Metoclopramide, Myocardial Infarction},
  issn = {1932-6203},
  doi = {10.1371/journal.pone.0072148},
  author = {Coloma, Preciosa M and Schuemie, Martijn J and Trifir{\`o}, Gianluca and Furlong, Laura and van Mulligen, Erik and Bauer-Mehren, Anna and Avillach, Paul and Kors, Jan and Sanz, Ferran and Mestres, Jordi and Oliveira, Jos{\'e} Luis and Boyer, Scott and Helgee, Ernst Ahlberg and Molokhia, Mariam and Matthews, Justin and Prieto-Merino, David and Gini, Rosa and Herings, Ron and Mazzaglia, Giampiero and Picelli, Gino and Scotti, Lorenza and Pedersen, Lars and van der Lei, Johan and Sturkenboom, Miriam}
}

% Lopes et al. (2013), PLoS One 8(12): platform for gathering and exploring pharmacovigilance knowledge.
@article{292846,
  title = {Gathering and exploring scientific knowledge in pharmacovigilance.},
  journal = {PLoS One},
  volume = {8},
  number = {12},
  year = {2013},
  pages = {e83016},
  abstract = {Pharmacovigilance plays a key role in the healthcare domain through the assessment, monitoring and discovery of interactions amongst drugs and their effects in the human organism. However, technological advances in this field have been slowing down over the last decade due to miscellaneous legal, ethical and methodological constraints. Pharmaceutical companies started to realize that collaborative and integrative approaches boost current drug research and development processes. Hence, new strategies are required to connect researchers, datasets, biomedical knowledge and analysis algorithms, allowing them to fully exploit the true value behind state-of-the-art pharmacovigilance efforts.
This manuscript introduces a new platform directed towards pharmacovigilance knowledge providers. This system, based on a service-oriented architecture, adopts a plugin-based approach to solve fundamental pharmacovigilance software challenges. With the wealth of collected clinical and pharmaceutical data, it is now possible to connect knowledge providers{\textquoteright} analysis and exploration algorithms with real data. As a result, new strategies allow a faster identification of high-risk interactions between marketed drugs and adverse events, and enable the automated uncovering of scientific evidence behind them. With this architecture, the pharmacovigilance field has a new platform to coordinate large-scale drug evaluation efforts in a unique ecosystem, publicly available at http://bioinformatics.ua.pt/euadr/.},
  issn = {1932-6203},
  doi = {10.1371/journal.pone.0083016},
  author = {Lopes, Pedro and Nunes, Tiago and Campos, David and Furlong, Laura Ines and Bauer-Mehren, Anna and Sanz, Ferran and Carrascosa, Maria Carmen and Mestres, Jordi and Kors, Jan and Singh, Bharat and van Mulligen, Erik and van der Lei, Johan and Diallo, Gayo and Avillach, Paul and Ahlberg, Ernst and Boyer, Scott and Diaz, Carlos and Oliveira, Jos{\'e} Lu{\'\i}s}
}

% Neuraz et al. (2013), PLoS Comput Biol 9(12): PheWAS on a quantitative trait (TPMT activity).
@article{292841,
  title = {Phenome-wide association studies on a quantitative trait: application to {TPMT} enzyme activity and thiopurine therapy in pharmacogenomics.},
  journal = {PLoS Comput Biol},
  volume = {9},
  number = {12},
  year = {2013},
  pages = {e1003405},
  abstract = {Phenome-Wide Association Studies (PheWAS) investigate whether genetic polymorphisms associated with a phenotype are also associated with other diagnoses. In this study, we have developed new methods to perform a PheWAS based on ICD-10 codes and biological test results, and to use a quantitative trait as the selection criterion. We tested our approach on thiopurine S-methyltransferase (TPMT) activity in patients treated by thiopurine drugs.
We developed 2 aggregation methods for the ICD-10 codes: an ICD-10 hierarchy and a mapping to existing ICD-9-CM based PheWAS codes. Eleven biological test results were also analyzed using discretization algorithms. We applied these methods in patients having a TPMT activity assessment from the clinical data warehouse of a French academic hospital between January 2000 and July 2013. Data after initiation of thiopurine treatment were analyzed and patient groups were compared according to their TPMT activity level. A total of 442 patient records were analyzed representing 10,252 ICD-10 codes and 72,711 biological test results. The results from the ICD-9-CM based PheWAS codes and ICD-10 hierarchy codes were concordant. Cross-validation with the biological test results allowed us to validate the ICD phenotypes. Iron-deficiency anemia and diabetes mellitus were associated with a very high TPMT activity (p = 0.0004 and p = 0.0015, respectively). We describe here an original method to perform PheWAS on a quantitative trait using both ICD-10 diagnosis codes and biological test results to identify associated phenotypes.
In the field of pharmacogenomics, PheWAS allow for the identification of new subgroups of patients who require personalized clinical and therapeutic management.},
  keywords = {Genome-Wide Association Study, Humans, International Classification of Diseases, Methyltransferases, Pharmacogenetics, Phenotype, Purines, Quantitative Trait Loci},
  issn = {1553-7358},
  doi = {10.1371/journal.pcbi.1003405},
  author = {Neuraz, Antoine and Chouchana, Laurent and Malamut, Georgia and Le Beller, Christine and Roche, Denis and Beaune, Philippe and Degoulet, Patrice and Burgun, Anita and Loriot, Marie-Anne and Avillach, Paul}
}

% Huoi et al. (2012), Euro Surveill 17(36): measles outbreak in Lyon, France, 2010 to 2011.
@article{356031,
  title = {A report on the large measles outbreak in {Lyon}, {France}, 2010 to 2011.},
  journal = {Euro Surveill},
  volume = {17},
  number = {36},
  year = {2012},
  pages = {20264},
  abstract = {In 2010 and 2011, the city of Lyon, located in the Rh{\^o}ne-Alpes region (France), has experienced one of the highest incidences of measles in Europe. We describe a measles outbreak in the Lyon area, where cases were diagnosed at Lyon University hospitals (LUH) between 2010 and mid-2011. Data were collected from the mandatory notification system of the regional public health agency, and from the virology department of the LUH. All patients and healthcare workers who had contracted measles were included. Overall, 407 cases were diagnosed, with children of less than one year of age accounting for the highest proportion (n=129, 32\%), followed by individuals between 17 and 29 years-old (n=126, 31\%). Of the total cases, 72 (18\%) had complications. The proportions of patients and healthcare workers who were not immune to measles were higher among those aged up to 30 years. Consequently, women of childbearing age constituted a specific population at high risk to contract measles and during this outbreak, 13 cases of measles, seven under 30 years-old, were identified among pregnant women.
This study highlights the importance of being vaccinated with two doses of measles vaccine, the only measure which could prevent and allow elimination of the disease.},
  keywords = {Adolescent, Adult, Age Distribution, Aged, Child, Child, Preschool, Disease Outbreaks, Female, France, Health Personnel, Hospitals, University, Humans, Incidence, Infant, Male, Mandatory Reporting, Measles, Middle Aged, Population Surveillance, Pregnancy, Pregnancy Complications, Infectious, Prospective Studies, Sex Distribution, Vaccination, Young Adult},
  issn = {1560-7917},
  author = {Huoi, C and Casalegno, J S and B{\'e}net, T and Neuraz, A and Billaud, G and Eibach, D and Mekki, Y and Rudigoz, R and Massardier, J and Huissoud, C and Massoud, M and Gaucherand, P and Claris, O and Gillet, Y and Floret, D and Lina, B and Vanhems, P}
}

% Bauer-Mehren et al. (2012), PLoS Comput Biol 8(4): automatic filtering and substantiation of drug safety signals.
@article{292901,
  title = {Automatic filtering and substantiation of drug safety signals.},
  journal = {PLoS Comput Biol},
  volume = {8},
  number = {4},
  year = {2012},
  pages = {e1002457},
  abstract = {Drug safety issues pose serious health threats to the population and constitute a major cause of mortality worldwide. Due to the prominent implications to both public health and the pharmaceutical industry, it is of great importance to unravel the molecular mechanisms by which an adverse drug reaction can be potentially elicited. These mechanisms can be investigated by placing the pharmaco-epidemiologically detected adverse drug reaction in an information-rich context and by exploiting all currently available biomedical knowledge to substantiate it. We present a computational framework for the biological annotation of potential adverse drug reactions. First, the proposed framework investigates previous evidences on the drug-event association in the context of biomedical literature (signal filtering).
Then, it seeks to provide a biological explanation (signal substantiation) by exploring mechanistic connections that might explain why a drug produces a specific adverse reaction. The mechanistic connections include the activity of the drug, related compounds and drug metabolites on protein targets, the association of protein targets to clinical events, and the annotation of proteins (both protein targets and proteins associated with clinical events) to biological pathways. Hence, the workflows for signal filtering and substantiation integrate modules for literature and database mining, in silico drug-target profiling, and analyses based on gene-disease networks and biological pathways. Application examples of these workflows carried out on selected cases of drug safety signals are discussed. The methodology and workflows presented offer a novel approach to explore the molecular mechanisms underlying adverse drug reactions.},
  keywords = {Computer Simulation, Database Management Systems, Databases, Factual, Documentation, Drug-Related Side Effects and Adverse Reactions, Humans, Information Storage and Retrieval, Models, Biological, Registries},
  issn = {1553-7358},
  doi = {10.1371/journal.pcbi.1002457},
  author = {Bauer-Mehren, Anna and van Mullingen, Erik M and Avillach, Paul and Carrascosa, Mar{\'\i}a Del Carmen and Garcia-Serna, Ricard and Pi{\~n}ero, Janet and Singh, Bharat and Lopes, Pedro and Oliveira, Jos{\'e} L and Diallo, Gayo and Helgee, Ernst Ahlberg and Boyer, Scott and Mestres, Jordi and Sanz, Ferran and Kors, Jan A and Furlong, Laura I}
}

% Pariente et al. (2012), Drug Saf 35(10): competition bias in safety signal generation from French spontaneous reports.
@article{292886,
  title = {Effect of competition bias in safety signal generation: analysis of a research database of spontaneous reports in {France}.},
  journal = {Drug Saf},
  volume = {35},
  number = {10},
  year = {2012},
  month = oct,
  pages = {855--864},
  abstract = {BACKGROUND: Automated disproportionality analysis of spontaneous reporting is increasingly used routinely.
It can theoretically be influenced by a competition bias for signal detection owing to the presence of reports related to well-established drug-event associations. OBJECTIVE: The aim of the study was to explore the effects of competition bias on safety signals generated from a large spontaneous reporting research database. METHODS: Using the case/non-case approach in the French spontaneous reporting research database, which includes data of reporting in France from January 1986 to December 2001, the effects of the competition bias were explored by generating safety signals associated with six events of interest (gastric and oesophageal haemorrhages, central nervous system haemorrhage and cerebrovascular accidents, ischaemic coronary disorders, migraine headaches, muscle pains, and hepatic enzymes and function abnormalities) before and after removing from the database reports relating to drugs known to be strongly associated with these events, whether they constituted cases or non-cases. As this study was performed on a closed database (last data entered 31 December 2001), potential signals unmasked by removal were considered as real signals if no or only incomplete knowledge about the association was available from the literature before 1 January 2002. RESULTS: For gastric and oesophageal haemorrhages, after removing reports involving antithrombotic agents or NSAIDs, three potential signals were unmasked (prednisone, rivastigmine and isotretinoin). For central nervous system haemorrhage and cerebrovascular accidents, after removing reports involving antithrombotic agents, three potential signals were unmasked (ethinylestradiol, interferon-$\alpha$-2B and methylprednisolone). For ischaemic coronary disorders, after removing reports involving anthracyclines, bleomycine, anti-HIV drugs or triptans, one potential signal was unmasked (ondansetron).
For migraine headaches, after removing reports involving nitrates, calcium channel blockers, opioid analgesics or intravenous immunoglobulins, six potential signals were unmasked (ammonium chloride, leflunomide, milnacipran, montelukast, proguanil and pyridostigmine). For muscle pains, after removing reports involving statins or fibrates, seven potential signals were unmasked (hydroxychloroquine, lactulose, levodopa in combination with dopadecarboxylase inhibitor, nevirapine, nomegestrol, ritonavir and stavudine). Finally, for hepatic enzymes and function abnormalities, after removing reports involving NSAIDs, anilides, antituberculosis drugs, antiepileptics, ketoconazole, tacrine, or amineptine, two potential signals were unmasked (caffeine, metformin). Of all these unmasked potential signals, ten appeared non/incompletely documented as at 1 January 2002 and were considered as real signals, with three of these later being confirmed by the literature and finally considered as true positives (isotretinoin, methylprednisolone and milnacipran). CONCLUSION: This study confirms that a competition bias can occur when performing safety signal generation in spontaneous reporting databases.
The minimization of this bias could lead to previously masked signals being revealed.},
  keywords = {Adverse Drug Reaction Reporting Systems, Bias (Epidemiology), data mining, Databases, Factual, Drug-Related Side Effects and Adverse Reactions, France, Humans},
  issn = {0114-5916},
  doi = {10.2165/11631780-000000000-00000},
  author = {Pariente, Antoine and Avillach, Paul and Salvo, Francesco and Thiessard, Frantz and Miremont-Salam{\'e}, Ghada and Fourrier-R{\'e}glat, Annie and Haramburu, Fran{\c c}oise and B{\'e}gaud, Bernard and Moore, Nicholas}
}

% Sahut D'Izarn et al. (2012), J Thromb Haemost 10(10): unsuspected pulmonary embolism in cancer patients.
@article{292896,
  title = {Risk factors and clinical outcome of unsuspected pulmonary embolism in cancer patients: a case-control study.},
  journal = {J Thromb Haemost},
  volume = {10},
  number = {10},
  year = {2012},
  month = oct,
  pages = {2032--2038},
  abstract = {BACKGROUND: Little is known about the risk factors and outcome of unsuspected pulmonary embolism (UPE) in cancer patients. OBJECTIVES: To assess the risk factors and outcome of UPE in cancer patients. METHODS: The charts of 66 patients diagnosed with UPE were reviewed. Two control groups were selected: 132 cancer patients without pulmonary embolism (PE) and 65 cancer patients with clinically suspected PE. Variables associated with UPE were identified by multivariable analysis. Six-month survival and recurrent venous thromboembolism were compared by use of Cox proportional analysis. RESULTS: Twenty-seven (40.9\%) patients with UPE had symptoms suggesting PE. Adenocarcinoma (odds ratio [OR] 4.45; 95\% confidence interval [CI] 1.98-9.97), advanced age (OR 1.18; 95\% CI 1.02-1.38), recent chemotherapy (OR 4.62; 95\% CI 2.26-9.44), performance status > 2 (OR 7.31; 95\% CI 1.90-28.15) and previous venous thromboembolism (OR 4.47; 95\% CI 1.16-17.13) were associated with UPE. When adjusted for tumor stage and performance status, 6-month mortality did not differ between patients with UPE and patients without PE (hazard ratio 1.40; 95\% CI 0.53-3.66; P = 0.50).
Patients with UPE were more likely to have central venous catheters and chemotherapy and less likely to have proximal clots than patients with clinically suspected PE. Recurrent venous thromboembolism occurred in 6.1\% and 7.7\% of patients with UPE and symptomatic PE, respectively. CONCLUSION: UPE is not associated with an increased risk of death. Patients with clinically suspected PE and those with UPE have similar risks of recurrent venous thromboembolism.},
  keywords = {Adenocarcinoma, Age Factors, Aged, Anticoagulants, Antineoplastic Agents, Catheterization, Central Venous, Chi-Square Distribution, Female, Humans, Kaplan-Meier Estimate, Karnofsky Performance Status, Logistic Models, Male, Middle Aged, Multidetector Computed Tomography, Multivariate Analysis, Odds Ratio, Predictive Value of Tests, Prognosis, Proportional Hazards Models, Pulmonary Embolism, Recurrence, Retrospective Studies, Risk Assessment, Risk Factors, Time Factors, Venous Thromboembolism},
  issn = {1538-7836},
  doi = {10.1111/j.1538-7836.2012.04868.x},
  author = {Sahut D{\textquoteright}Izarn, M and Caumont Prim, A and Planquette, B and Revel, M P and Avillach, P and Chatellier, G and Sanchez, O and Meyer, G}
}

% Trifiro et al. (2011), Stud Health Technol Inform 166: EU-ADR network vs. spontaneous reporting databases for signal detection.
@article{292906,
  title = {{EU-ADR} healthcare database network vs. spontaneous reporting system database: preliminary comparison of signal detection.},
  journal = {Stud Health Technol Inform},
  volume = {166},
  year = {2011},
  pages = {25--30},
  abstract = {The EU-ADR project aims to exploit different European electronic healthcare records (EHR) databases for drug safety signal detection. In this paper we report the preliminary results concerning the comparison of signal detection between EU-ADR network and two spontaneous reporting databases, the Food and Drug Administration and World Health Organization databases.
EU-ADR data sources consist of eight databases in four countries (Denmark, Italy, Netherlands, and United Kingdom) that are virtually linked through distributed data network. A custom-built software (Jerboa{\textcopyright}) elaborates harmonized input data that are produced locally and generates aggregated data which are then stored in a central repository. Those data are subsequently analyzed through different statistics (i.e. Longitudinal Gamma Poisson Shrinker). As potential signals, all the drugs that are associated to six events of interest (bullous eruptions - BE, acute renal failure - ARF, acute myocardial infarction - AMI, anaphylactic shock - AS, rhabdomyolysis - RHABD, and upper gastrointestinal bleeding - UGIB) have been detected via different data mining techniques in the two systems. Subsequently a comparison concerning the number of drugs that could be investigated and the potential signals detected for each event in the spontaneous reporting systems (SRSs) and EU-ADR network was made. SRSs could explore, as potential signals, a larger number of drugs for the six events, in comparison to EU-ADR (range: 630-3,393 vs. 87-856), particularly for those events commonly thought to be potentially drug-induced (i.e. BE: 3,393 vs. 228). The highest proportion of signals detected in SRSs was found for BE, ARF and AS, while for ARF, and UGIB in EU-ADR. In conclusion, it seems that EU-ADR longitudinal database network may complement traditional spontaneous reporting system for signal detection, especially for those adverse events that are frequent in general population and are not commonly thought to be drug-induced.
The methodology for signal detection in EU-ADR is still under development and testing phase.},
  keywords = {Adverse Drug Reaction Reporting Systems, data mining, Databases, Factual, Drug-Related Side Effects and Adverse Reactions, Europe, Humans, Medical Records Systems, Computerized, United States, United States Food and Drug Administration, World Health Organization},
  issn = {0926-9630},
  author = {Trifir{\`o}, Gianluca and Patadia, Vaishali and Schuemie, Martijn J and Coloma, Preciosa M and Gini, Rosa and Herings, Ron and Hippisley-Cox, Julia and Mazzaglia, Giampiero and Giaquinto, Carlo and Scotti, Lorenza and Pedersen, Lars and Avillach, Paul and Sturkenboom, Miriam C J M and van der Lei, Johan}
}

% Dahdul et al. (2010), PLoS One 5(5): curating evolutionary phenotype data from the systematic biology literature.
@article{356691,
  title = {Evolutionary characters, phenotypes and ontologies: curating data from the systematic biology literature.},
  journal = {PLoS One},
  volume = {5},
  number = {5},
  year = {2010},
  pages = {e10708},
  abstract = {BACKGROUND: The wealth of phenotypic descriptions documented in the published articles, monographs, and dissertations of phylogenetic systematics is traditionally reported in a free-text format, and it is therefore largely inaccessible for linkage to biological databases for genetics, development, and phenotypes, and difficult to manage for large-scale integrative work. The Phenoscape project aims to represent these complex and detailed descriptions with rich and formal semantics that are amenable to computation and integration with phenotype data from other fields of biology. This entails reconceptualizing the traditional free-text characters into the computable Entity-Quality (EQ) formalism using ontologies.
METHODOLOGY/PRINCIPAL FINDINGS: We used ontologies and the EQ formalism to curate a collection of 47 phylogenetic studies on ostariophysan fishes (including catfishes, characins, minnows, knifefishes) and their relatives with the goal of integrating these complex phenotype descriptions with information from an existing model organism database (zebrafish, http://zfin.org). We developed a curation workflow for the collection of character, taxonomic and specimen data from these publications. A total of 4,617 phenotypic characters (10,512 states) for 3,449 taxa, primarily species, were curated into EQ formalism (for a total of 12,861 EQ statements) using anatomical and taxonomic terms from teleost-specific ontologies (Teleost Anatomy Ontology and Teleost Taxonomy Ontology) in combination with terms from a quality ontology (Phenotype and Trait Ontology). Standards and guidelines for consistently and accurately representing phenotypes were developed in response to the challenges that were evident from two annotation experiments and from feedback from curators. CONCLUSIONS/SIGNIFICANCE: The challenges we encountered and many of the curation standards and methods for improving consistency that we developed are generally applicable to any effort to represent phenotypes using ontologies. This is because an ontological representation of the detailed variations in phenotype, whether between mutant or wildtype, among individual humans, or across the diversity of species, requires a process by which a precise combination of terms from domain ontologies are selected and organized according to logical relations. The efficiencies that we have developed in this process will be useful for any attempt to annotate complex phenotypic descriptions using ontologies.
We also discuss some ramifications of EQ representation for the domain of systematics.},
  keywords = {Animals, Biological Evolution, Computational Biology, Databases, Genetic, Fishes, Phenotype, Publications, Systems Biology},
  issn = {1932-6203},
  doi = {10.1371/journal.pone.0010708},
  author = {Dahdul, Wasila M and Balhoff, James P and Engeman, Jeffrey and Grande, Terry and Hilton, Eric J and Kothari, Cartik and Lapp, Hilmar and Lundberg, John G and Midford, Peter E and Vision, Todd J and Westerfield, Monte and Mabee, Paula M}
}

% Balhoff et al. (2010), PLoS One 5(5): Phenex, a desktop application for ontological annotation of phenotypes.
@article{356651,
  title = {Phenex: ontological annotation of phenotypic diversity.},
  journal = {PLoS One},
  volume = {5},
  number = {5},
  year = {2010},
  pages = {e10500},
  abstract = {BACKGROUND: Phenotypic differences among species have long been systematically itemized and described by biologists in the process of investigating phylogenetic relationships and trait evolution. Traditionally, these descriptions have been expressed in natural language within the context of individual journal publications or monographs. As such, this rich store of phenotype data has been largely unavailable for statistical and computational comparisons across studies or integration with other biological knowledge. METHODOLOGY/PRINCIPAL FINDINGS: Here we describe Phenex, a platform-independent desktop application designed to facilitate efficient and consistent annotation of phenotypic similarities and differences using Entity-Quality syntax, drawing on terms from community ontologies for anatomical entities, phenotypic qualities, and taxonomic names. Phenex can be configured to load only those ontologies pertinent to a taxonomic group of interest. The graphical user interface was optimized for evolutionary biologists accustomed to working with lists of taxa, characters, character states, and character-by-taxon matrices.
CONCLUSIONS/SIGNIFICANCE: Annotation of phenotypic data using ontologies and globally unique taxonomic identifiers will allow biologists to integrate phenotypic data from different organisms and studies, leveraging decades of work in systematics and comparative morphology.}, keywords = {Biodiversity, Biological Evolution, Computational Biology, Internet, Phenotype, Semantics, Software}, issn = {1932-6203}, doi = {10.1371/journal.pone.0010500}, author = {Balhoff, James P and Dahdul, Wasila M and Kothari, Cartik R and Lapp, Hilmar and Lundberg, John G and Mabee, Paula and Midford, Peter E and Westerfield, Monte and Vision, Todd J} } @article {292916, title = {Design and evaluation of a semantic approach for the homogeneous identification of events in eight patient databases: a contribution to the European EU-ADR project.}, journal = {Stud Health Technol Inform}, volume = {160}, number = {Pt 2}, year = {2010}, month = {2010}, pages = {1085-9}, abstract = {The overall objective of the EU-ADR project is the design, development, and validation of a computerised system that exploits data from electronic health records and biomedical databases for the early detection of adverse drug reactions. Eight different databases, containing health records of more than 30 million European citizens, are involved in the project. Unique queries cannot be performed across different databases because of their heterogeneity: Medical record and Claims databases, four different terminologies for coding diagnoses, and two languages for the information described in free text. The aim of our study was to provide database owners with a common basis for the construction of their queries. 
Using the UMLS, we provided a list of medical concepts, with their corresponding terms and codes in the four terminologies, which should be considered to retrieve the relevant information for the events of interest from the databases.}, keywords = {Databases, Factual, Electronic Health Records, Humans, Medical Records, Semantics, Terminology as Topic, Unified Medical Language System}, issn = {0926-9630}, author = {Avillach, Paul and Joubert, Michel and Thiessard, Frantz and Trifir{\`o}, Gianluca and Dufour, Jean-Charles and Pariente, Antoine and Mougin, Fleur and Polimeni, Giovanni and Catania, Maria Antonietta and Giaquinto, Carlo and Mazzaglia, Giampiero and Fornari, Carla and Herings, Ron and Gini, Rosa and Hippisley-Cox, Julia and Molokhia, Mariam and Pedersen, Lars and Fourrier-R{\'e}glat, Annie and Sturkenboom, Miriam and Fieschi, Marius} } @article {292911, title = {A potential competition bias in the detection of safety signals from spontaneous reporting databases.}, journal = {Pharmacoepidemiol Drug Saf}, volume = {19}, number = {11}, year = {2010}, month = {2010 Nov}, pages = {1166-71}, abstract = {PURPOSE: To study whether reports related to known drug-event associations could hinder the detection of new signals by increasing the detection thresholds when using disporportionality analyses in spontaneous reporting (SR) databases. METHODS: The French SR database (2005-2006 data) was used to test this hypothesis for the following events: bleeding, headache, hepatitis, myalgia, myocardial infarction, stroke, and toxic epidermal necrolysis (TEN). For each of these, using the Proportional Reporting Ratio (PRR) and the Reporting Odds Ratio (ROR), the number of cases needed to trigger a signal out of 50, 100, and 200 reports for a hypothetical newly introduced drug were computed before and after removing from the database reports involving drugs known to be associated with the event. 
RESULTS: For bleeding and stroke, removing potentially competitive data resulted in a decrease of the number of cases needed to trigger a signal for a newly introduced drug for both PRR and ROR (e.g., from 9 to 4, and 5 to 3 cases out of 50 reports for bleeding and stroke, respectively using the PRR). They were not or only slightly modified for the other studied events. CONCLUSIONS: Removing reports related to known drug-event associations could increase the sensitivity of signal detection in SR databases. This should be considered when using SR databases for signal detection as it could result in earlier identification of new drug-event associations.}, keywords = {Adverse Drug Reaction Reporting Systems, Bias (Epidemiology), Data Interpretation, Statistical, Databases, Factual, Drug-Related Side Effects and Adverse Reactions, France, Humans, Odds Ratio}, issn = {1099-1557}, doi = {10.1002/pds.2022}, author = {Pariente, Antoine and Didailler, Marie and Avillach, Paul and Miremont-Salam{\'e}, Ghada and Fourrier-Reglat, Annie and Haramburu, Fran{\c c}oise and Moore, Nicholas} } @article {292921, title = {A semantic approach for the homogeneous identification of events in eight patient databases: a contribution to the European eu-ADR project.}, journal = {Stud Health Technol Inform}, volume = {150}, year = {2009}, month = {2009}, pages = {190-4}, abstract = {The overall objective of the eu-ADR project is the design, development, and validation of a computerised system that exploits data from electronic health records and biomedical databases for the early detection of adverse drug reactions. Eight different databases, containing health records of more than 30 million European citizens, are involved in the project. Unique queries cannot be performed across different databases because of their heterogeneity: Medical record and Claims databases, four different terminologies for coding diagnoses, and two languages for the information described in free text. 
The aim of our study was to provide database owners with a common basis for the construction of their queries. Using the UMLS, we provided a list of medical concepts, with their corresponding terms and codes in the four terminologies, which should be considered to retrieve the relevant information for the events of interest from the databases.}, keywords = {Databases, Factual, Europe, Information Storage and Retrieval, Medical Records Systems, Computerized, Semantics, Terminology as Topic, Unified Medical Language System}, issn = {0926-9630}, author = {Avillach, Paul and Mougin, Fleur and Joubert, Michel and Thiessard, Frantz and Pariente, Antoine and Dufour, Jean-Charles and Trifir{\`o}, Gianluca and Polimeni, Giovanni and Catania, Maria Antonietta and Giaquinto, Carlo and Mazzaglia, Giampiero and Baio, Gianluca and Herings, Ron and Gini, Rosa and Hippisley-Cox, Julia and Molokhia, Mariam and Pedersen, Lars and Fourrier-R{\'e}glat, Annie and Sturkenboom, Miriam and Fieschi, Marius} } @article {292926, title = {Using discharge abstracts to evaluate a regional perinatal network: assessment of the linkage procedure of anonymous data.}, journal = {Int J Telemed Appl}, volume = {2009}, year = {2009}, month = {2009}, pages = {181842}, abstract = {To assess the Burgundy perinatal network (18 obstetrical units; 18 500 births per year), discharge abstracts and additional data were collected for all mothers and newborns. In accordance with French law, data were rendered anonymous before statistical analysis, and were linked to patients using a specific procedure. This procedure allowed data concerning each mother to be linked to those for her newborn(s). This study showed that all mothers and newborns were included in the regional database; the data for all mothers were linked to those for their infant(s) in all cases. 
Additional data (gestational age) were obtained for 99.9\% of newborns.}, issn = {1687-6415}, doi = {10.1155/2009/181842}, author = {Quantin, Catherine and Gouyon, B{\'e}atrice and Avillach, Paul and Ferdynus, Cyril and Sagot, Paul and Gouyon, Jean-Bernard} } @article {292946, title = {Building application-related patient identifiers: what solution for a European country?}, journal = {Int J Telemed Appl}, year = {2008}, month = {2008}, pages = {678302}, abstract = {We propose a method utilizing a derived social security number with the same reliability as the social security number. We show the anonymity techniques classically based on unidirectional hash functions (such as the secure hash algorithm (SHA-2) function that can guarantee the security, quality, and reliability of information if these techniques are applied to the Social Security Number). Hashing produces a strictly anonymous code that is always the same for a given individual, and thus enables patient data to be linked. Different solutions are developed and proposed in this article. Hashing the social security number will make it possible to link the information in the personal medical file to other national health information sources with the aim of completing or validating the personal medical record or conducting epidemiological and clinical research. 
This data linkage would meet the anonymous data requirements of the European directive on data protection.}, issn = {1687-6415}, doi = {10.1155/2008/678302}, author = {Quantin, Catherine and Allaert, Fran{\c c}ois-Andr{\'e} and Avillach, Paul and Fassa, Maniane and Riandey, Beno{\^\i}t and Trouessin, Gilles and Cohen, Olivier} } @article {292936, title = {Improving the quality of the coding of primary diagnosis in standardized discharge summaries.}, journal = {Health Care Manag Sci}, volume = {11}, number = {2}, year = {2008}, month = {2008 Jun}, pages = {147-51}, abstract = {We propose to design and test an information-processing model to participate in appraising the quality and the consistency of the coding, for billing, of Standardized Discharge Summaries (SDSs). We designed a model using both symbolic knowledge extracted from the NLM{\textquoteright}s UMLS and statistical knowledge. The aim is to retrieve from the ICD-10 terms recorded in a SDS the Principal Diagnosis (PD) at the time of coding. In 90\% of cases the PD was retrieved 1st or 2nd in SDS including three ICD-10 codes or more. 
This model could contribute as part of an automated quality control process in a hospital information system by checking consistency in coded SDSs and improve the income of the hospital.}, keywords = {Continuity of Patient Care, Hospital Administration, Humans, Information Systems, International Classification of Diseases, Medical Records Systems, Computerized, Patient Discharge, Quality of Health Care, Reproducibility of Results, Terminology as Topic, Vocabulary, Controlled}, issn = {1386-9620}, author = {Avillach, Paul and Joubert, Michel and Fieschi, Marius} } @article {292941, title = {Using knowledge for indexing health web resources in a quality-controlled gateway.}, journal = {Stud Health Technol Inform}, volume = {136}, year = {2008}, month = {2008}, pages = {205-10}, abstract = {OBJECTIVES: The aim of this study is to provide to indexers MeSH terms to be considered as major ones in a list of terms automatically extracted from a document. MATERIAL AND METHODS: We propose a method combining symbolic knowledge - the UMLS Metathesaurus and Semantic Network - and statistical knowledge drawn from co-occurrences of terms in the CISMeF database (a French-language quality-controlled health gateway) using data mining measures. The method was tested on CISMeF corpus of 293 resources. RESULTS: There was a proportion of 0.37+/-0.26 major terms in the processed records. The method produced lists of terms with a proportion of terms initially pointed out as major of 0.54+/-0.31. DISCUSSION: The method we propose reduces the number of terms, which seem not useful for content description of resources, such as "check tags", but retains the most descriptive ones. Discarding these terms is accounted for by: 1) the removal by using semantic knowledge of associations of concepts bearing no real medical significance, 2) the removal by using statistical knowledge of nonstatistically significant associations of terms. 
CONCLUSION: This method can assist effectively indexers in their daily work and will be soon applied in the CISMeF system.}, keywords = {Databases, Bibliographic, France, Humans, Internet, Knowledge Bases, Medical Subject Headings, Semantics, Unified Medical Language System, Vocabulary, Controlled}, issn = {0926-9630}, author = {Joubert, Michel and Darmoni, Stefan J and Avillach, Paul and Dahamna, Badisse and Fieschi, Marius} } @article {292961, title = {How to manage secure direct access of European patients to their computerized medical record and personal medical record.}, journal = {Stud Health Technol Inform}, volume = {127}, year = {2007}, month = {2007}, pages = {246-55}, abstract = {The multiplication of the requests of the patients for a direct access to their Medical Record (MR), the development of Personal Medical Record (PMR) supervised by the patients themselves, the increasing development of the patients{\textquoteright} electronic medical records (EMRs) and the world wide internet utilization will lead to envisage an access by using technical automatic and scientific way. It will require the addition of different conditions: a unique patient identifier which could base on a familial component in order to get access to the right record anywhere in Europe, very strict identity checks using cryptographic techniques such as those for the electronic signature, which will ensure the authentication of the requests sender and the integrity of the file but also the protection of the confidentiality and the access follow up. The electronic medical record must also be electronically signed by the practitioner in order to get evidence that he has given his agreement and taken the liability for that. This electronic signature also avoids any kind of post-transmission falsification. 
This will become extremely important, especially in France where patients will have the possibility to mask information that, they do not want to appear in their personal medical record. Currently, the idea of every citizen having electronic signatures available appears positively Utopian. But this is yet the case in eGovernment, eHealth and eShopping, world-wide. The same was thought about smart cards before they became generally available and useful when banks issued them.}, keywords = {Access to Information, Europe, Humans, Internet, Medical Records Systems, Computerized}, issn = {0926-9630}, author = {Quantin, Catherine and Allaert, Fran{\c c}ois Andr{\'e} and Fassa, Maniane and Riandey, Beno{\^\i}t and Avillach, Paul and Cohen, Olivier} } @article {292951, title = {Interoperability issues regarding patient identification in Europe.}, journal = {Conf Proc IEEE Eng Med Biol Soc}, volume = {2007}, year = {2007}, month = {2007}, pages = {6161}, keywords = {Computer Security, Confidentiality, Database Management Systems, Europe, Information Dissemination, Medical Informatics, Medical Records Systems, Computerized, Patient Identification Systems, Systems Integration}, issn = {1557-170X}, doi = {10.1109/IEMBS.2007.4353760}, author = {Quantin, C and Allaert, F- A and Fassa, M and Avillach, P and Fieschi, M and Cohen, O} } @article {292931, title = {A model for indexing medical documents combining statistical and symbolic knowledge.}, journal = {AMIA Annu Symp Proc}, year = {2007}, month = {2007}, pages = {31-5}, abstract = {OBJECTIVES: To develop and evaluate an information processing method based on terminologies, in order to index medical documents in any given documentary context. METHODS: We designed a model using both symbolic general knowledge extracted from the Unified Medical Language System (UMLS) and statistical knowledge extracted from a domain of application. 
Using statistical knowledge allowed us to contextualize the general knowledge for every particular situation. For each document studied, the extracted terms are ranked to highlight the most significant ones. The model was tested on a set of 17,079 French standardized discharge summaries (SDSs). RESULTS: The most important ICD-10 term of each SDS was ranked 1st or 2nd by the method in nearly 90\% of the cases. CONCLUSIONS: The use of several terminologies leads to more precise indexing. The improvement achieved in the models implementation performances as a result of using semantic relationships is encouraging.}, keywords = {Abstracting and Indexing as Topic, Humans, Information Storage and Retrieval, International Classification of Diseases, Medical Records, Natural Language Processing, Patient Discharge, Statistics as Topic, Unified Medical Language System, Vocabulary, Controlled}, issn = {1942-597X}, author = {Avillach, Paul and Joubert, Michel and Fieschi, Marius} } @article {292956, title = {Proposal of a French health identification number interoperable at the European level.}, journal = {Stud Health Technol Inform}, volume = {129}, number = {Pt 1}, year = {2007}, month = {2007}, pages = {503-7}, abstract = {The French ministry of Health is setting up the Personal Medical Record (PMR). This innovative tool has long been expected by French Health Authorities, Associations of Patients, other Health{\textquoteright}s associations, those defending Individual Liberties and the French National Data Protection Authority. The PMR will lead to improvements in many areas such as Diagnosis (Research and monitoring) Healthcare (Management of emergencies, urgent situations, Temporal health monitoring and evaluation), Therapy (Cohorts of patients for Clinical trials and epidemiological studies). The PMR will foster safe healthcare management, clinical research and epidemiological studies. 
Nevertheless, it raises many important questions regarding duplicates and the quality, precision and coherence of the linkage with other health data coming from different sources. The currently planned identifying process raises many questions with regard to its ability to deal with potential duplicates and to perform data linkage with other health data sources. Through this article, using the electronic health records, we develop and propose an identification process to improve the French PMR. Our proposed unique patient identifier will guarantee the security, confidentiality and privacy of the personal data, and will prove to be particularly useful for health planning, health policies and research as well as clinical and epidemiological studies. Finally, it will certainly be interoperable with other European health information systems. We propose here an alternative identification procedure that would allow France to broaden the scope of its PMR project by making it possible to contribute to public health research and policy while increasing interoperability with European health information systems and preserving the confidentiality of the data.}, keywords = {Access to Information, Computer Security, Confidentiality, Europe, France, Humans, Medical Records, Medical Records Systems, Computerized, Patient Identification Systems}, issn = {0926-9630}, author = {Quantin, Catherine and Allaert, Fran{\c c}ois-Andr{\'e} and Avillach, Paul and Riandey, Beno{\^\i}t and Fieschi, Marius and Fassa, Maniane and Cohen, Olivier} }