<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="article-commentary" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMU</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Mhealth Uhealth</journal-id>
      <journal-title>JMIR mHealth and uHealth</journal-title>
      <issn pub-type="epub">2291-5222</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v12i1e57978</article-id>
      <article-id pub-id-type="pmid">38688841</article-id>
      <article-id pub-id-type="doi">10.2196/57978</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Commentary</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Commentary</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>The Evaluation of Generative AI Should Include Repetition to Assess Stability</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Buis</surname>
            <given-names>Lorraine</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Zhu</surname>
            <given-names>Lingxuan</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0001-9077-408X</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Mou</surname>
            <given-names>Weiming</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0007-1089-6516</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Hong</surname>
            <given-names>Chenglin</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0009-3565-3486</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>Tao</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0007-5246-3284</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Lai</surname>
            <given-names>Yancheng</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0004-8444-7535</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Qi</surname>
            <given-names>Chang</given-names>
          </name>
          <degrees>MEng</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0005-3840-550X</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Lin</surname>
            <given-names>Anqi</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6324-0410</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Jian</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7217-0111</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Luo</surname>
            <given-names>Peng</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Oncology</institution>
            <institution>Zhujiang Hospital</institution>
            <institution>Southern Medical University</institution>
            <addr-line>253 Industrial Avenue</addr-line>
            <addr-line>Guangzhou</addr-line>
            <country>China</country>
            <phone>86 020 61643888</phone>
            <email>luopeng@smu.edu.cn</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8215-2045</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Oncology</institution>
        <institution>Zhujiang Hospital</institution>
        <institution>Southern Medical University</institution>
        <addr-line>Guangzhou</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Urology</institution>
        <institution>Shanghai General Hospital</institution>
        <institution>Shanghai Jiao Tong University School of Medicine</institution>
        <addr-line>Shanghai</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Medical Oncology</institution>
        <institution>National Cancer Center/National Clinical Research Center for Cancer/Cancer Hospital</institution>
        <institution>Chinese Academy of Medical Sciences and Peking Union Medical College</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Institute of Logic and Computation</institution>
        <institution>TU Wien</institution>
        <country>Austria</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Peng Luo <email>luopeng@smu.edu.cn</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>6</day>
        <month>5</month>
        <year>2024</year>
      </pub-date>
      <volume>12</volume>
      <elocation-id>e57978</elocation-id>
      <history>
        <date date-type="received">
          <day>1</day>
          <month>3</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>30</day>
          <month>4</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Lingxuan Zhu, Weiming Mou, Chenglin Hong, Tao Yang, Yancheng Lai, Chang Qi, Anqi Lin, Jian Zhang, Peng Luo. Originally published in JMIR mHealth and uHealth (https://mhealth.jmir.org), 06.05.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR mHealth and uHealth, is properly cited. The complete bibliographic information, a link to the original publication on https://mhealth.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://mhealth.jmir.org/2024/1/e57978" xlink:type="simple"/>
      <related-article related-article-type="commentary-article" id="v12i1e51526" ext-link-type="doi" xlink:href="10.2196/51526" vol="12" page="e51526" xlink:type="simple">https://mhealth.jmir.org/2024/1/e51526/</related-article>
      <abstract>
        <p>The increasing interest in the potential applications of generative artificial intelligence (AI) models like ChatGPT in health care has prompted numerous studies to explore its performance in various medical contexts. However, evaluating ChatGPT poses unique challenges due to the inherent randomness in its responses. Unlike traditional AI models, ChatGPT generates different responses for the same input, making it imperative to assess its stability through repetition. This commentary highlights the importance of including repetition in the evaluation of ChatGPT to ensure the reliability of conclusions drawn from its performance. Similar to biological experiments, which often require multiple repetitions for validity, we argue that assessing generative AI models like ChatGPT demands a similar approach. Failure to acknowledge the impact of repetition can lead to biased conclusions and undermine the credibility of research findings. We urge researchers to incorporate appropriate repetition in their studies from the outset and transparently report their methods to enhance the robustness and reproducibility of findings in this rapidly evolving field.</p>
      </abstract>
      <kwd-group>
        <kwd>large language model</kwd>
        <kwd>generative AI</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>health care</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <p>Since OpenAI released ChatGPT-3.5, there has been a growing interest within the medical community regarding the prospective applications of this general pretrained model in health care [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref7">7</xref>]. A search of the PubMed database using ChatGPT as the keyword shows that 2075 papers discussing ChatGPT were published in 2023. As the leading publisher in the field of digital medicine, JMIR Publications Inc published a total of 115 papers related to ChatGPT in the year 2023. It should be noted that this is a quick and simple search that may not comprehensively capture all relevant articles, but it provides a general reflection of the growing interest and research on ChatGPT in the medical field. For example, Gilson et al [<xref ref-type="bibr" rid="ref8">8</xref>] explored the performance of ChatGPT on the United States Medical Licensing Examination (USMLE) step 1 and step 2 exams, discovering that ChatGPT’s performance exceeded the passing score for third-year medical students in step 1. More studies are exploring ChatGPT’s performance on other medical exams, such as the Japanese and German Medical Licensing Examinations [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>], the Otolaryngology-Head and Neck Surgery Certification Examinations [<xref ref-type="bibr" rid="ref11">11</xref>], and the UK Standardized Admission Tests [<xref ref-type="bibr" rid="ref12">12</xref>]. Beyond examinations, many articles have discussed the potential applications of ChatGPT in medicine from various perspectives. Shao et al [<xref ref-type="bibr" rid="ref13">13</xref>] examined the suitability of using ChatGPT for perioperative patient education in thoracic surgery within English and Chinese contexts. 
Cheng et al [<xref ref-type="bibr" rid="ref14">14</xref>] investigated whether ChatGPT could be used to generate summaries for medical research, and Hsu et al [<xref ref-type="bibr" rid="ref15">15</xref>] evaluated whether ChatGPT could correctly answer basic medication consultation questions. However, we would like to point out that as a relatively new technology, there are some differences in evaluating the potential application of generative artificial intelligence (AI) like ChatGPT in health care that require additional attention from researchers.</p>
    <p>The most significant difference affecting the evaluation of ChatGPT compared to traditional AI models known to people is the randomness inherent in the responses generated by ChatGPT. Common perception holds that for a given input, an AI model should produce the same output consistently each time. However, for natural language models like ChatGPT, this is not the case. ChatGPT generates a response by predicting the next most likely word, followed by each subsequent word. The process of generating responses involves a certain degree of randomness. If you access ChatGPT using the application programming interface, you can also control the degree of randomness in the generated responses with the temperature parameter. Even with the same input, the responses provided by ChatGPT will not be the same, and sometimes may even be completely contradictory. Therefore, when evaluating ChatGPT’s performance, it is necessary to generate multiple responses to the same input and assess these responses collectively to explore ChatGPT’s performance accurately; otherwise, there is a high likelihood of drawing biased conclusions. For example, as one of the earliest studies published, Sarraju et al [<xref ref-type="bibr" rid="ref4">4</xref>] asked the same question three times and assessed whether the three responses given by ChatGPT to the same question were consistent. As OpenAI made the ChatGPT application programming interface accessible, it became feasible to ask the same question many more times. In a recent study investigating whether ChatGPT’s peer-review conclusions are influenced by the reputation of the author’s institution, von Wedel et al [<xref ref-type="bibr" rid="ref16">16</xref>] conducted 250 repeated experiments for each question to mitigate the effects of ChatGPT’s randomness. However, not all researchers have recognized this aspect. 
For instance, in a study where ChatGPT was asked to answer the American Heart Association Basic Life Support and Advanced Cardiovascular Life Support exams, the authors found that ChatGPT could not pass either examination [<xref ref-type="bibr" rid="ref17">17</xref>]. However, that study only asked the question once without repeating, which means that the randomness of ChatGPT could have had an impact on the experiment, affecting the reliability of the conclusions. In another improved study, researchers acknowledged the impact of ChatGPT’s randomness, asking each question three times. Compared to earlier results, ChatGPT’s performance in this study significantly improved, and it could pass the Basic Life Support exam [<xref ref-type="bibr" rid="ref18">18</xref>], further underscoring the importance of repetitions. Therefore, it is inappropriate to evaluate ChatGPT’s performance based on a single response if one aims to draw rigorous, scientifically meaningful conclusions. Just as biological experiments typically require three repetitions for validity, without repetition, it becomes challenging to determine whether the observed phenomenon is an inherent characteristic of the model or merely a random occurrence. Additionally, for models intended for clinical practice applications, whether for patient education, diagnosis, or support in clinical documentation writing, we hope that ChatGPT can always provide correct and harmless responses. Repetition also allows us to evaluate the model’s stability and further assess its application value. However, we noticed that the authors of many recent manuscripts we reviewed were not aware of this, thus affecting the reliability of the conclusions.</p>
    <p>Therefore, in research on the application of generative AI like ChatGPT in health care, appropriate repetition should be included to comprehensively evaluate the model’s performance by assessing the stability of the model in the task set by the author. This should be considered from the beginning of the research. Since models like ChatGPT will continue to be upgraded, if the authors only realize the need for repetition when revising the manuscript, there will be a considerable time gap between the authors’ supplementary analysis and the original analysis. The model has likely been upgraded during this period, introducing new uncertainties into the research. Alternatively, the authors need to completely redo the analysis from scratch during the manuscript revision process, wasting time and effort. Therefore, we hope that future researchers will recognize the necessity of repeated experiments from the start and report in the manuscript how the repetition was carried out in the study [<xref ref-type="bibr" rid="ref19">19</xref>].</p>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">USMLE</term>
          <def>
            <p>United States Medical Licensing Examination</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grünebaum</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chervenak</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pollet</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Katz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chervenak</surname>
              <given-names>FA</given-names>
            </name>
          </person-group>
          <article-title>The exciting potential for ChatGPT in obstetrics and gynecology</article-title>
          <source>Am J Obstet Gynecol</source>
          <year>2023</year>
          <month>06</month>
          <volume>228</volume>
          <issue>6</issue>
          <fpage>696</fpage>
          <lpage>705</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ajog.2023.03.009</pub-id>
          <pub-id pub-id-type="medline">36924907</pub-id>
          <pub-id pub-id-type="pii">S0002-9378(23)00154-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Howard</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hope</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Gerada</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT and antimicrobial advice: the end of the consulting infection doctor?</article-title>
          <source>Lancet Infect Dis</source>
          <year>2023</year>
          <month>04</month>
          <volume>23</volume>
          <issue>4</issue>
          <fpage>405</fpage>
          <lpage>406</lpage>
          <pub-id pub-id-type="doi">10.1016/S1473-3099(23)00113-5</pub-id>
          <pub-id pub-id-type="medline">36822213</pub-id>
          <pub-id pub-id-type="pii">S1473-3099(23)00113-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mou</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Potential of large language models as tools against medical disinformation</article-title>
          <source>JAMA Intern Med</source>
          <year>2024</year>
          <month>04</month>
          <day>01</day>
          <volume>184</volume>
          <issue>4</issue>
          <fpage>450</fpage>
          <pub-id pub-id-type="doi">10.1001/jamainternmed.2024.0020</pub-id>
          <pub-id pub-id-type="medline">38407861</pub-id>
          <pub-id pub-id-type="pii">2815262</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sarraju</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bruemmer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Van Iterson</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Rodriguez</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Laffin</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Appropriateness of cardiovascular disease prevention recommendations obtained from a popular online chat-based artificial intelligence model</article-title>
          <source>JAMA</source>
          <year>2023</year>
          <month>03</month>
          <day>14</day>
          <volume>329</volume>
          <issue>10</issue>
          <fpage>842</fpage>
          <lpage>844</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36735264"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jama.2023.1044</pub-id>
          <pub-id pub-id-type="medline">36735264</pub-id>
          <pub-id pub-id-type="pii">2801244</pub-id>
          <pub-id pub-id-type="pmcid">PMC10015303</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mou</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Can the ChatGPT and other large language models with internet-connected database solve the questions and concerns of patient with prostate cancer and help democratize medical knowledge?</article-title>
          <source>J Transl Med</source>
          <year>2023</year>
          <month>04</month>
          <day>19</day>
          <volume>21</volume>
          <issue>1</issue>
          <fpage>269</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://translational-medicine.biomedcentral.com/articles/10.1186/s12967-023-04123-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12967-023-04123-5</pub-id>
          <pub-id pub-id-type="medline">37076876</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12967-023-04123-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC10115367</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ali</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Dobbs</surname>
              <given-names>TD</given-names>
            </name>
            <name name-style="western">
              <surname>Hutchings</surname>
              <given-names>HA</given-names>
            </name>
            <name name-style="western">
              <surname>Whitaker</surname>
              <given-names>IS</given-names>
            </name>
          </person-group>
          <article-title>Using ChatGPT to write patient clinic letters</article-title>
          <source>Lancet Digit Health</source>
          <year>2023</year>
          <month>04</month>
          <volume>5</volume>
          <issue>4</issue>
          <fpage>e179</fpage>
          <lpage>e181</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2589-7500(23)00048-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/S2589-7500(23)00048-1</pub-id>
          <pub-id pub-id-type="medline">36894409</pub-id>
          <pub-id pub-id-type="pii">S2589-7500(23)00048-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>SB</given-names>
            </name>
            <name name-style="western">
              <surname>Lam</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT: the future of discharge summaries?</article-title>
          <source>Lancet Digit Health</source>
          <year>2023</year>
          <month>03</month>
          <volume>5</volume>
          <issue>3</issue>
          <fpage>e107</fpage>
          <lpage>e108</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2589-7500(23)00021-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/S2589-7500(23)00021-3</pub-id>
          <pub-id pub-id-type="medline">36754724</pub-id>
          <pub-id pub-id-type="pii">S2589-7500(23)00021-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gilson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Safranek</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Socrates</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Chartash</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? The implications of large language models for medical education and knowledge assessment</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>02</month>
          <day>08</day>
          <volume>9</volume>
          <fpage>e45312</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e45312/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/45312</pub-id>
          <pub-id pub-id-type="medline">36753318</pub-id>
          <pub-id pub-id-type="pii">v9i1e45312</pub-id>
          <pub-id pub-id-type="pmcid">PMC9947764</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meyer</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Riese</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Streichert</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Comparison of the performance of GPT-3.5 and GPT-4 with that of medical students on the written German Medical Licensing Examination: observational study</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <month>02</month>
          <day>08</day>
          <volume>10</volume>
          <fpage>e50965</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024/1/e50965/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/50965</pub-id>
          <pub-id pub-id-type="medline">38329802</pub-id>
          <pub-id pub-id-type="pii">v10i1e50965</pub-id>
          <pub-id pub-id-type="pmcid">PMC10884900</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yanagita</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yokokawa</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Uchida</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tawara</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ikusaka</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Accuracy of ChatGPT on medical questions in the National Medical Licensing Examination in Japan: evaluation study</article-title>
          <source>JMIR Form Res</source>
          <year>2023</year>
          <month>10</month>
          <day>13</day>
          <volume>7</volume>
          <fpage>e48023</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://formative.jmir.org/2023/1/e48023/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48023</pub-id>
          <pub-id pub-id-type="medline">37831496</pub-id>
          <pub-id pub-id-type="pii">v7i1e48023</pub-id>
          <pub-id pub-id-type="pmcid">PMC10612006</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Long</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lowe</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Santos</surname>
              <given-names>AD</given-names>
            </name>
            <name name-style="western">
              <surname>Alanazi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>O'Brien</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>ED</given-names>
            </name>
            <name name-style="western">
              <surname>Cote</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>A novel evaluation model for assessing ChatGPT on Otolaryngology-Head and Neck Surgery Certification Examinations: performance study</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <month>01</month>
          <day>16</day>
          <volume>10</volume>
          <fpage>e49970</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024/1/e49970/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/49970</pub-id>
          <pub-id pub-id-type="medline">38227351</pub-id>
          <pub-id pub-id-type="pii">v10i1e49970</pub-id>
          <pub-id pub-id-type="pmcid">PMC10828939</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Giannos</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Delardas</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on UK standardized admission tests: insights from the BMAT, TMUA, LNAT, and TSA examinations</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>04</month>
          <day>26</day>
          <volume>9</volume>
          <fpage>e47737</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e47737/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/47737</pub-id>
          <pub-id pub-id-type="medline">37099373</pub-id>
          <pub-id pub-id-type="pii">v9i1e47737</pub-id>
          <pub-id pub-id-type="pmcid">PMC10173042</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Appropriateness and comprehensiveness of using ChatGPT for perioperative patient education in thoracic surgery in different language contexts: survey study</article-title>
          <source>Interact J Med Res</source>
          <year>2023</year>
          <month>08</month>
          <day>14</day>
          <volume>12</volume>
          <fpage>e46900</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.i-jmr.org/2023/1/e46900/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/46900</pub-id>
          <pub-id pub-id-type="medline">37578819</pub-id>
          <pub-id pub-id-type="pii">v12i1e46900</pub-id>
          <pub-id pub-id-type="pmcid">PMC10463083</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tsai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bai</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ko</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Tsai</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Comparisons of quality, correctness, and similarity between ChatGPT-generated and human-written abstracts for basic research: cross-sectional study</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <month>12</month>
          <day>25</day>
          <volume>25</volume>
          <fpage>e51229</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e51229/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/51229</pub-id>
          <pub-id pub-id-type="medline">38145486</pub-id>
          <pub-id pub-id-type="pii">v25i1e51229</pub-id>
          <pub-id pub-id-type="pmcid">PMC10760418</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hsieh</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Examining real-world medication consultations and drug-herb interactions: ChatGPT performance evaluation</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>08</month>
          <day>21</day>
          <volume>9</volume>
          <fpage>e48433</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e48433/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48433</pub-id>
          <pub-id pub-id-type="medline">37561097</pub-id>
          <pub-id pub-id-type="pii">v9i1e48433</pub-id>
          <pub-id pub-id-type="pmcid">PMC10477918</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>von Wedel</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Schmitt</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Thiele</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Leuner</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Shay</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Redaelli</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schaefer</surname>
              <given-names>MS</given-names>
            </name>
          </person-group>
          <article-title>Affiliation bias in peer review of abstracts by a large language model</article-title>
          <source>JAMA</source>
          <year>2024</year>
          <month>01</month>
          <day>16</day>
          <volume>331</volume>
          <issue>3</issue>
          <fpage>252</fpage>
          <lpage>253</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2023.24641</pub-id>
          <pub-id pub-id-type="medline">38150261</pub-id>
          <pub-id pub-id-type="pii">2813511</pub-id>
          <pub-id pub-id-type="pmcid">PMC10753437</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fijačko</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gosak</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Štiglic</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Picard</surname>
              <given-names>CT</given-names>
            </name>
            <name name-style="western">
              <surname>Douma</surname>
              <given-names>MJ</given-names>
            </name>
          </person-group>
          <article-title>Can ChatGPT pass the life support exams without entering the American Heart Association course?</article-title>
          <source>Resuscitation</source>
          <year>2023</year>
          <month>04</month>
          <volume>185</volume>
          <fpage>109732</fpage>
          <pub-id pub-id-type="doi">10.1016/j.resuscitation.2023.109732</pub-id>
          <pub-id pub-id-type="medline">36775020</pub-id>
          <pub-id pub-id-type="pii">S0300-9572(23)00045-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mou</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT can pass the AHA exams: open-ended questions outperform multiple-choice format</article-title>
          <source>Resuscitation</source>
          <year>2023</year>
          <month>07</month>
          <volume>188</volume>
          <fpage>109783</fpage>
          <pub-id pub-id-type="doi">10.1016/j.resuscitation.2023.109783</pub-id>
          <pub-id pub-id-type="medline">37349064</pub-id>
          <pub-id pub-id-type="pii">S0300-9572(23)00096-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mou</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>STAGER checklist: Standardized Testing and Assessment Guidelines for Evaluating Generative AI Reliability</article-title>
          <source>arXiv. Preprint posted online on December 8, 2023</source>
          <year>2023</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2312.10074</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
